author    Dimitry Andric <dim@FreeBSD.org>  2017-07-13 19:25:18 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2017-07-13 19:25:18 +0000
commit    ca089b24d48ef6fa8da2d0bb8c25bb802c4a95c0 (patch)
tree      3a28a772df9b17aef34f49e3c727965ad28c0c93 /test/CodeGen
parent    9df3605dea17e84f8183581f6103bd0c79e2a606 (diff)
Vendor import of llvm trunk r307894 (tag: vendor/llvm/llvm-trunk-r307894)
Notes:
    svn path=/vendor/llvm/dist/; revision=320957
    svn path=/vendor/llvm/llvm-trunk-r307894/; revision=320958; tag=vendor/llvm/llvm-trunk-r307894
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll8
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir30
-rw-r--r--test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir38
-rw-r--r--test/CodeGen/AArch64/arm64-csldst-mmo.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-misched-memdep-bug.ll6
-rw-r--r--test/CodeGen/AArch64/fence-singlethread.ll2
-rw-r--r--test/CodeGen/AArch64/preferred-function-alignment.ll26
-rw-r--r--test/CodeGen/AArch64/tailcall_misched_graph.ll4
-rw-r--r--test/CodeGen/AMDGPU/add.i16.ll10
-rw-r--r--test/CodeGen/AMDGPU/add.ll18
-rw-r--r--test/CodeGen/AMDGPU/add.v2i16.ll4
-rw-r--r--test/CodeGen/AMDGPU/add_i128.ll16
-rw-r--r--test/CodeGen/AMDGPU/add_i64.ll8
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast.ll33
-rw-r--r--test/CodeGen/AMDGPU/alignbit-pat.ll2
-rw-r--r--test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll38
-rw-r--r--test/CodeGen/AMDGPU/and-gcn.ll3
-rw-r--r--test/CodeGen/AMDGPU/and.ll55
-rw-r--r--test/CodeGen/AMDGPU/any_extend_vector_inreg.ll6
-rw-r--r--test/CodeGen/AMDGPU/bitreverse.ll20
-rw-r--r--test/CodeGen/AMDGPU/bswap.ll2
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll8
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes.ll6
-rw-r--r--test/CodeGen/AMDGPU/clamp-omod-special-case.mir46
-rw-r--r--test/CodeGen/AMDGPU/coalescer_remat.ll2
-rw-r--r--test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir187
-rw-r--r--test/CodeGen/AMDGPU/constant-fold-mi-operands.ll2
-rw-r--r--test/CodeGen/AMDGPU/copy-illegal-type.ll62
-rw-r--r--test/CodeGen/AMDGPU/ctlz.ll75
-rw-r--r--test/CodeGen/AMDGPU/ctlz_zero_undef.ll78
-rw-r--r--test/CodeGen/AMDGPU/ctpop.ll91
-rw-r--r--test/CodeGen/AMDGPU/ctpop64.ll29
-rw-r--r--test/CodeGen/AMDGPU/cttz_zero_undef.ll19
-rw-r--r--test/CodeGen/AMDGPU/cvt_f32_ubyte.ll88
-rw-r--r--test/CodeGen/AMDGPU/detect-dead-lanes.mir10
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll4
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_superreg.ll10
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll6
-rw-r--r--test/CodeGen/AMDGPU/early-if-convert-cost.ll2
-rw-r--r--test/CodeGen/AMDGPU/early-if-convert.ll2
-rw-r--r--test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll11
-rw-r--r--test/CodeGen/AMDGPU/extractelt-to-trunc.ll14
-rw-r--r--test/CodeGen/AMDGPU/fabs.f16.ll14
-rw-r--r--test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll50
-rw-r--r--test/CodeGen/AMDGPU/fadd.f16.ll58
-rw-r--r--test/CodeGen/AMDGPU/fadd64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize-elimination.ll487
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize.f16.ll18
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize.ll2
-rw-r--r--test/CodeGen/AMDGPU/fcmp.f16.ll312
-rw-r--r--test/CodeGen/AMDGPU/fcmp64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fconst64.ll9
-rw-r--r--test/CodeGen/AMDGPU/fcopysign.f16.ll91
-rw-r--r--test/CodeGen/AMDGPU/fdiv.f16.ll6
-rw-r--r--test/CodeGen/AMDGPU/fdiv.ll41
-rw-r--r--test/CodeGen/AMDGPU/fma-combine.ll34
-rw-r--r--test/CodeGen/AMDGPU/fma.f64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fma.ll4
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.ll10
-rw-r--r--test/CodeGen/AMDGPU/fmed3.ll4
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.ll10
-rw-r--r--test/CodeGen/AMDGPU/fmul.f16.ll22
-rw-r--r--test/CodeGen/AMDGPU/fmul64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.f16.ll28
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.f32.ll82
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.v2f16.ll18
-rw-r--r--test/CodeGen/AMDGPU/fneg-combines.ll62
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.f16.ll4
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.ll6
-rw-r--r--test/CodeGen/AMDGPU/fneg.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/fold-immediate-output-mods.mir53
-rw-r--r--test/CodeGen/AMDGPU/fold-operands-order.mir6
-rw-r--r--test/CodeGen/AMDGPU/fp32_to_fp16.ll6
-rw-r--r--test/CodeGen/AMDGPU/fpext.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/fptosi.f16.ll6
-rw-r--r--test/CodeGen/AMDGPU/fptoui.f16.ll6
-rw-r--r--test/CodeGen/AMDGPU/fptrunc.f16.ll12
-rw-r--r--test/CodeGen/AMDGPU/fract.f64.ll10
-rw-r--r--test/CodeGen/AMDGPU/fract.ll12
-rw-r--r--test/CodeGen/AMDGPU/frem.ll8
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.f64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.ll6
-rw-r--r--test/CodeGen/AMDGPU/fsub.f16.ll28
-rw-r--r--test/CodeGen/AMDGPU/fsub.ll24
-rw-r--r--test/CodeGen/AMDGPU/fsub64.ll4
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i16.ll4
-rw-r--r--test/CodeGen/AMDGPU/global-smrd-unknown.ll20
-rw-r--r--test/CodeGen/AMDGPU/half.ll10
-rw-r--r--test/CodeGen/AMDGPU/imm.ll4
-rw-r--r--test/CodeGen/AMDGPU/immv216.ll8
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-si.ll8
-rw-r--r--test/CodeGen/AMDGPU/inline-asm.ll4
-rw-r--r--test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.class.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.ceil.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.cos.f16.ll12
-rw-r--r--test/CodeGen/AMDGPU/llvm.exp2.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.floor.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.fma.f16.ll12
-rw-r--r--test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll24
-rw-r--r--test/CodeGen/AMDGPU/llvm.log2.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.maxnum.f16.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.minnum.f16.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.rint.f16.ll10
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.sin.f16.ll12
-rw-r--r--test/CodeGen/AMDGPU/llvm.sqrt.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.trunc.f16.ll8
-rw-r--r--test/CodeGen/AMDGPU/load-global-f32.ll10
-rw-r--r--test/CodeGen/AMDGPU/load-global-f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/load-global-i16.ll10
-rw-r--r--test/CodeGen/AMDGPU/load-global-i32.ll8
-rw-r--r--test/CodeGen/AMDGPU/load-global-i64.ll10
-rw-r--r--test/CodeGen/AMDGPU/load-global-i8.ll10
-rw-r--r--test/CodeGen/AMDGPU/load-weird-sizes.ll10
-rw-r--r--test/CodeGen/AMDGPU/lower-mem-intrinsics.ll12
-rw-r--r--test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir227
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll106
-rw-r--r--test/CodeGen/AMDGPU/madak.ll6
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll4
-rw-r--r--test/CodeGen/AMDGPU/max.ll4
-rw-r--r--test/CodeGen/AMDGPU/merge-stores.ll4
-rw-r--r--test/CodeGen/AMDGPU/mubuf.ll2
-rw-r--r--test/CodeGen/AMDGPU/mul.ll6
-rw-r--r--test/CodeGen/AMDGPU/multi-divergent-exit-region.ll4
-rw-r--r--test/CodeGen/AMDGPU/no-shrink-extloads.ll2
-rw-r--r--test/CodeGen/AMDGPU/or.ll6
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll2
-rw-r--r--test/CodeGen/AMDGPU/reduce-load-width-alignment.ll6
-rw-r--r--test/CodeGen/AMDGPU/regcoal-subrange-join.mir162
-rw-r--r--test/CodeGen/AMDGPU/reorder-stores.ll4
-rw-r--r--test/CodeGen/AMDGPU/rotl.i64.ll4
-rw-r--r--test/CodeGen/AMDGPU/rotr.i64.ll4
-rw-r--r--test/CodeGen/AMDGPU/rsq.ll8
-rw-r--r--test/CodeGen/AMDGPU/s_movk_i32.ll4
-rw-r--r--test/CodeGen/AMDGPU/sad.ll4
-rw-r--r--test/CodeGen/AMDGPU/saddo.ll6
-rw-r--r--test/CodeGen/AMDGPU/salu-to-valu.ll6
-rw-r--r--test/CodeGen/AMDGPU/scalar_to_vector.ll6
-rw-r--r--test/CodeGen/AMDGPU/schedule-global-loads.ll2
-rw-r--r--test/CodeGen/AMDGPU/scratch-buffer.ll4
-rw-r--r--test/CodeGen/AMDGPU/scratch-simple.ll6
-rw-r--r--test/CodeGen/AMDGPU/sdiv.ll6
-rw-r--r--test/CodeGen/AMDGPU/sdwa-peephole.ll24
-rw-r--r--test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll54
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll6
-rw-r--r--test/CodeGen/AMDGPU/select.f16.ll63
-rw-r--r--test/CodeGen/AMDGPU/setcc-fneg-constant.ll6
-rw-r--r--test/CodeGen/AMDGPU/setcc.ll10
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg.ll8
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll4
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy.ll4
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll4
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll2
-rw-r--r--test/CodeGen/AMDGPU/shift-i64-opts.ll4
-rw-r--r--test/CodeGen/AMDGPU/shl.ll4
-rw-r--r--test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir161
-rw-r--r--test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll2
-rw-r--r--test/CodeGen/AMDGPU/sign_extend.ll4
-rw-r--r--test/CodeGen/AMDGPU/sitofp.f16.ll4
-rw-r--r--test/CodeGen/AMDGPU/sminmax.ll26
-rw-r--r--test/CodeGen/AMDGPU/sminmax.v2i16.ll6
-rw-r--r--test/CodeGen/AMDGPU/spill-cfg-position.ll2
-rw-r--r--test/CodeGen/AMDGPU/sra.ll6
-rw-r--r--test/CodeGen/AMDGPU/srem.ll6
-rw-r--r--test/CodeGen/AMDGPU/srl.ll4
-rw-r--r--test/CodeGen/AMDGPU/ssubo.ll6
-rw-r--r--test/CodeGen/AMDGPU/sub.i16.ll10
-rw-r--r--test/CodeGen/AMDGPU/sub.ll4
-rw-r--r--test/CodeGen/AMDGPU/sub.v2i16.ll16
-rw-r--r--test/CodeGen/AMDGPU/syncscopes.ll19
-rw-r--r--test/CodeGen/AMDGPU/trunc-bitcast-vector.ll4
-rw-r--r--test/CodeGen/AMDGPU/trunc.ll6
-rw-r--r--test/CodeGen/AMDGPU/uaddo.ll10
-rw-r--r--test/CodeGen/AMDGPU/udiv.ll8
-rw-r--r--test/CodeGen/AMDGPU/uitofp.f16.ll4
-rw-r--r--test/CodeGen/AMDGPU/urem.ll6
-rw-r--r--test/CodeGen/AMDGPU/usubo.ll12
-rw-r--r--test/CodeGen/AMDGPU/v_cndmask.ll12
-rw-r--r--test/CodeGen/AMDGPU/v_mac.ll10
-rw-r--r--test/CodeGen/AMDGPU/v_mac_f16.ll38
-rw-r--r--test/CodeGen/AMDGPU/vectorize-global-local.ll2
-rw-r--r--test/CodeGen/AMDGPU/vop-shrink-frame-index.mir161
-rw-r--r--test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir40
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll25
-rw-r--r--test/CodeGen/AMDGPU/waitcnt-permute.mir12
-rw-r--r--test/CodeGen/AMDGPU/xor.ll8
-rw-r--r--test/CodeGen/AMDGPU/zext-i64-bit-operand.ll4
-rw-r--r--test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll24
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir1252
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll30
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir20
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir1612
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalizer.mir33
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir58
-rw-r--r--test/CodeGen/ARM/arguments-nosplit-double.ll1
-rw-r--r--test/CodeGen/ARM/arguments-nosplit-i64.ll1
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll8
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-ldm.ll4
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll2
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-vfma.ll28
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll10
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-vldm.ll6
-rw-r--r--test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll2
-rw-r--r--test/CodeGen/ARM/fence-singlethread.ll2
-rw-r--r--test/CodeGen/ARM/ror.ll33
-rw-r--r--test/CodeGen/ARM/scavenging.mir66
-rw-r--r--test/CodeGen/AVR/branch-relaxation.ll96
-rw-r--r--test/CodeGen/AVR/ctlz.ll5
-rw-r--r--test/CodeGen/AVR/cttz.ll4
-rw-r--r--test/CodeGen/AVR/frmidx-iterator-bug.ll33
-rw-r--r--test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll15
-rw-r--r--test/CodeGen/AVR/pseudo/ANDIWRdK.mir6
-rw-r--r--test/CodeGen/AVR/pseudo/COMWRd.mir2
-rw-r--r--test/CodeGen/AVR/pseudo/ORIWRdK.mir2
-rw-r--r--test/CodeGen/AVR/pseudo/SBCIWRdK.mir2
-rw-r--r--test/CodeGen/AVR/pseudo/SUBIWRdK.mir2
-rw-r--r--test/CodeGen/AVR/select-mbb-placement-bug.ll6
-rw-r--r--test/CodeGen/BPF/undef.ll58
-rw-r--r--test/CodeGen/Generic/pr33094.ll18
-rw-r--r--test/CodeGen/Hexagon/convertdptoint.ll8
-rw-r--r--test/CodeGen/Hexagon/convertdptoll.ll4
-rw-r--r--test/CodeGen/Hexagon/convertsptoint.ll4
-rw-r--r--test/CodeGen/Hexagon/convertsptoll.ll4
-rw-r--r--test/CodeGen/Hexagon/dadd.ll8
-rw-r--r--test/CodeGen/Hexagon/dmul.ll8
-rw-r--r--test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll8
-rw-r--r--test/CodeGen/Hexagon/dsub.ll8
-rw-r--r--test/CodeGen/Hexagon/fadd.ll8
-rw-r--r--test/CodeGen/Hexagon/fmul.ll8
-rw-r--r--test/CodeGen/Hexagon/fsub.ll8
-rw-r--r--test/CodeGen/Hexagon/hasfp-crash1.ll82
-rw-r--r--test/CodeGen/Hexagon/hasfp-crash2.ll83
-rw-r--r--test/CodeGen/Hexagon/hvx-nontemporal.ll28
-rw-r--r--test/CodeGen/Hexagon/target-flag-ext.mir24
-rw-r--r--test/CodeGen/MIR/AArch64/atomic-memoperands.mir4
-rw-r--r--test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir19
-rw-r--r--test/CodeGen/MIR/AArch64/target-memoperands.mir22
-rw-r--r--test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir20
-rw-r--r--test/CodeGen/MIR/AMDGPU/syncscopes.mir98
-rw-r--r--test/CodeGen/MIR/AMDGPU/target-flags.mir29
-rw-r--r--test/CodeGen/MIR/Generic/runPass.mir2
-rw-r--r--test/CodeGen/MIR/Hexagon/target-flags.mir36
-rw-r--r--test/CodeGen/MIR/X86/tied-physical-regs-match.mir22
-rw-r--r--test/CodeGen/MSP430/Inst16mm.ll4
-rw-r--r--test/CodeGen/NVPTX/lower-aggr-copies.ll61
-rw-r--r--test/CodeGen/PowerPC/PR33636.ll702
-rw-r--r--test/CodeGen/PowerPC/atomics-regression.ll528
-rw-r--r--test/CodeGen/PowerPC/bitreverse.ll23
-rw-r--r--test/CodeGen/PowerPC/build-vector-tests.ll4
-rw-r--r--test/CodeGen/PowerPC/ppc-ctr-dead-code.ll38
-rw-r--r--test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll32
-rw-r--r--test/CodeGen/PowerPC/ppc64le-smallarg.ll4
-rw-r--r--test/CodeGen/PowerPC/pr33093.ll165
-rw-r--r--test/CodeGen/PowerPC/select-addrRegRegOnly.ll37
-rw-r--r--test/CodeGen/PowerPC/svr4-redzone.ll6
-rw-r--r--test/CodeGen/PowerPC/tailcall1-64.ll7
-rw-r--r--test/CodeGen/PowerPC/testBitReverse.ll105
-rw-r--r--test/CodeGen/PowerPC/vec_extract_p9.ll167
-rw-r--r--test/CodeGen/PowerPC/vec_int_ext.ll253
-rw-r--r--test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll16
-rw-r--r--test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir34
-rw-r--r--test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll22
-rw-r--r--test/CodeGen/WebAssembly/umulo-i64.ll21
-rw-r--r--test/CodeGen/X86/2012-08-16-setcc.ll42
-rw-r--r--test/CodeGen/X86/GC/badreadproto.ll2
-rw-r--r--test/CodeGen/X86/GC/badrootproto.ll2
-rw-r--r--test/CodeGen/X86/GC/badwriteproto.ll2
-rw-r--r--test/CodeGen/X86/GC/fat.ll2
-rw-r--r--test/CodeGen/X86/GC/outside.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/GV.ll63
-rw-r--r--test/CodeGen/X86/GlobalISel/add-vec.ll173
-rw-r--r--test/CodeGen/X86/GlobalISel/constant.ll9
-rw-r--r--test/CodeGen/X86/GlobalISel/ext-x86-64.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/ext.ll36
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-GV.mir31
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-ext.mir171
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir110
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll22
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar.ll20
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir27
-rw-r--r--test/CodeGen/X86/GlobalISel/select-GV.mir99
-rw-r--r--test/CodeGen/X86/GlobalISel/select-constant.mir31
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir64
-rw-r--r--test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir53
-rw-r--r--test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir74
-rw-r--r--test/CodeGen/X86/GlobalISel/x86_64-fallback.ll18
-rw-r--r--test/CodeGen/X86/avg.ll6
-rw-r--r--test/CodeGen/X86/avx-cmp.ll197
-rw-r--r--test/CodeGen/X86/avx-load-store.ll277
-rw-r--r--test/CodeGen/X86/avx-schedule.ll648
-rw-r--r--test/CodeGen/X86/avx-unpack.ll166
-rw-r--r--test/CodeGen/X86/avx-vinsertf128.ll118
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll12
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll2
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll26
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll53
-rw-r--r--test/CodeGen/X86/avx512vl-vec-cmp.ll925
-rw-r--r--test/CodeGen/X86/avx512vl-vec-masked-cmp.ll50906
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-128.ll156
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-256.ll104
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-512.ll1868
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll3483
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll3279
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool.ll685
-rw-r--r--test/CodeGen/X86/bitcast-setcc-128.ll156
-rw-r--r--test/CodeGen/X86/bitcast-setcc-256.ll419
-rw-r--r--test/CodeGen/X86/bitcast-setcc-512.ll1377
-rw-r--r--test/CodeGen/X86/block-placement.ll101
-rw-r--r--test/CodeGen/X86/bool-simplify.ll129
-rw-r--r--test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll1991
-rw-r--r--test/CodeGen/X86/bswap-wide-int.ll4
-rw-r--r--test/CodeGen/X86/build-vector-128.ll23
-rw-r--r--test/CodeGen/X86/build-vector-256.ll29
-rw-r--r--test/CodeGen/X86/build-vector-512.ll20
-rw-r--r--test/CodeGen/X86/cast-vsel.ll2
-rw-r--r--test/CodeGen/X86/clear_upper_vector_element_bits.ll236
-rw-r--r--test/CodeGen/X86/cmov.ll205
-rw-r--r--test/CodeGen/X86/code_placement_cold_loop_blocks.ll5
-rw-r--r--test/CodeGen/X86/combine-avx-intrinsics.ll47
-rw-r--r--test/CodeGen/X86/combine-avx2-intrinsics.ll69
-rw-r--r--test/CodeGen/X86/combine-rotates.ll80
-rw-r--r--test/CodeGen/X86/combine-sse41-intrinsics.ll72
-rw-r--r--test/CodeGen/X86/constant-hoisting-bfi.ll52
-rw-r--r--test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll124
-rw-r--r--test/CodeGen/X86/extract-store.ll2
-rw-r--r--test/CodeGen/X86/extractelement-legalization-store-ordering.ll51
-rw-r--r--test/CodeGen/X86/fast-isel-abort-warm.ll19
-rw-r--r--test/CodeGen/X86/fast-isel-gc-intrinsics.ll57
-rw-r--r--test/CodeGen/X86/fastisel-softfloat.ll15
-rw-r--r--test/CodeGen/X86/fp128-i128.ll2
-rw-r--r--test/CodeGen/X86/gather-addresses.ll16
-rw-r--r--test/CodeGen/X86/half.ll1045
-rw-r--r--test/CodeGen/X86/illegal-bitfield-loadstore.ll251
-rw-r--r--test/CodeGen/X86/optimize-max-1.ll51
-rw-r--r--test/CodeGen/X86/optimize-max-2.ll26
-rw-r--r--test/CodeGen/X86/pr15309.ll50
-rw-r--r--test/CodeGen/X86/pr23603.ll27
-rw-r--r--test/CodeGen/X86/pr33715.ll16
-rw-r--r--test/CodeGen/X86/rdrand-x86_64.ll19
-rw-r--r--test/CodeGen/X86/rdrand.ll119
-rw-r--r--test/CodeGen/X86/rdseed-x86_64.ll19
-rw-r--r--test/CodeGen/X86/rdseed.ll66
-rw-r--r--test/CodeGen/X86/recip-fastmath.ll116
-rw-r--r--test/CodeGen/X86/recip-fastmath2.ll162
-rw-r--r--test/CodeGen/X86/regalloc-reconcile-broken-hints.ll2
-rw-r--r--test/CodeGen/X86/rotate4.ll104
-rw-r--r--test/CodeGen/X86/sbb.ll46
-rw-r--r--test/CodeGen/X86/select_const.ll113
-rw-r--r--test/CodeGen/X86/shift-codegen.ll42
-rw-r--r--test/CodeGen/X86/shift-folding.ll57
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-256.ll313
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-512.ll422
-rw-r--r--test/CodeGen/X86/sink-blockfreq.ll2
-rw-r--r--test/CodeGen/X86/sink-gep-before-mem-inst.ll25
-rw-r--r--test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll55
-rw-r--r--test/CodeGen/X86/sse-schedule.ll248
-rw-r--r--test/CodeGen/X86/sse2-schedule.ll598
-rw-r--r--test/CodeGen/X86/sse3-schedule.ll48
-rw-r--r--test/CodeGen/X86/sse41-schedule.ll222
-rw-r--r--test/CodeGen/X86/sse42-schedule.ll38
-rw-r--r--test/CodeGen/X86/sse4a-schedule.ll95
-rw-r--r--test/CodeGen/X86/ssse3-schedule.ll74
-rw-r--r--test/CodeGen/X86/swizzle-avx2.ll73
-rw-r--r--test/CodeGen/X86/tbm_patterns.ll502
-rw-r--r--test/CodeGen/X86/vec-copysign.ll2
-rw-r--r--test/CodeGen/X86/vec_return.ll17
-rw-r--r--test/CodeGen/X86/vec_shift6.ll9
-rw-r--r--test/CodeGen/X86/vec_unsafe-fp-math.ll15
-rw-r--r--test/CodeGen/X86/vector-popcnt-128.ll93
-rw-r--r--test/CodeGen/X86/vector-popcnt-256.ll14
-rw-r--r--test/CodeGen/X86/vector-popcnt-512.ll120
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-sse4a.ll86
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-ssse3.ll15
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse4a.ll129
-rw-r--r--test/CodeGen/X86/vector-truncate-combine.ll10
-rw-r--r--test/CodeGen/X86/vector-tzcnt-128.ll54
-rw-r--r--test/CodeGen/X86/vector-tzcnt-256.ll28
-rw-r--r--test/CodeGen/X86/vector-tzcnt-512.ll124
-rw-r--r--test/CodeGen/X86/wide-integer-cmp.ll2
-rw-r--r--test/CodeGen/X86/x32-lea-1.ll10
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll229
-rw-r--r--test/CodeGen/X86/zext-shl.ll39
-rw-r--r--test/CodeGen/X86/zext-trunc.ll9
394 files changed, 74783 insertions, 11228 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 50ad83feed85..10ce87c2a187 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1328,16 +1328,16 @@ define void @test_load_store_atomics(i8* %addr) {
; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr)
; CHECK: [[V1:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr)
; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr)
-; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load singlethread seq_cst 1 from %ir.addr)
-; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store singlethread monotonic 1 into %ir.addr)
+; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load syncscope("singlethread") seq_cst 1 from %ir.addr)
+; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store syncscope("singlethread") monotonic 1 into %ir.addr)
%v0 = load atomic i8, i8* %addr unordered, align 1
store atomic i8 %v0, i8* %addr monotonic, align 1
%v1 = load atomic i8, i8* %addr acquire, align 1
store atomic i8 %v1, i8* %addr release, align 1
- %v2 = load atomic i8, i8* %addr singlethread seq_cst, align 1
- store atomic i8 %v2, i8* %addr singlethread monotonic, align 1
+ %v2 = load atomic i8, i8* %addr syncscope("singlethread") seq_cst, align 1
+ store atomic i8 %v2, i8* %addr syncscope("singlethread") monotonic, align 1
ret void
}
diff --git a/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
new file mode 100644
index 000000000000..8604b2769ba3
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
@@ -0,0 +1,30 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+ define void @implicit_def() { ret void }
+...
+
+---
+# CHECK-LABEL: name: implicit_def
+name: implicit_def
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+
+# CHECK: body:
+# CHECK: [[DEF:%[0-9]+]] = IMPLICIT_DEF
+# CHECK: [[ADD:%[0-9]+]] = ADDWrr [[DEF]], [[DEF]]
+# CHECK: %w0 = COPY [[ADD]]
+body: |
+ bb.0:
+ %0(s32) = G_IMPLICIT_DEF
+ %1(s32) = G_ADD %0, %0
+ %w0 = COPY %1(s32)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
new file mode 100644
index 000000000000..43e682c6b6ca
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
@@ -0,0 +1,38 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+ define void @sdiv_s32_gpr() { ret void }
+...
+
+---
+# Check that we select a 32-bit GPR sdiv intrinsic into SDIVWrr for GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: sdiv_s32_gpr
+name: sdiv_s32_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: gpr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %w0
+# CHECK: %1 = COPY %w1
+# CHECK: %2 = SDIVWr %0, %1
+body: |
+ bb.0:
+ liveins: %w0, %w1
+
+ %0(s32) = COPY %w0
+ %1(s32) = COPY %w1
+ %2(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.sdiv.i32), %0, %1
+ %w0 = COPY %2(s32)
+...
diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
index cfb8e3a38c49..37cc5411aa31 100644
--- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll
+++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
@@ -13,9 +13,9 @@
; CHECK: SU(2): STRWui %WZR
; CHECK: SU(3): %X21<def>, %X20<def> = LDPXi %SP
; CHECK: Predecessors:
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: ord SU(0)
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Ord
; CHECK-NEXT: Successors:
define void @test1() {
entry:
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index cde62fcb3f95..ad4feef7280f 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -8,8 +8,8 @@
; CHECK: shiftable
; CHECK: SU(2): %vreg2<def> = SUBXri %vreg1, 20, 0
; CHECK: Successors:
-; CHECK-NEXT: data SU(4): Latency=1 Reg=%vreg2
-; CHECK-NEXT: data SU(3): Latency=2 Reg=%vreg2
+; CHECK-NEXT: SU(4): Data Latency=1 Reg=%vreg2
+; CHECK-NEXT: SU(3): Data Latency=2 Reg=%vreg2
; CHECK: ********** INTERVALS **********
define i64 @shiftable(i64 %A, i64 %B) {
%tmp0 = sub i64 %B, 20
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
index 748a4762d82f..9cbf0cb3803a 100644
--- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -7,11 +7,11 @@
; CHECK: misched_bug:BB#0 entry
; CHECK: SU(2): %vreg2<def> = LDRWui %vreg0, 1; mem:LD4[%ptr1_plus1] GPR32:%vreg2 GPR64common:%vreg0
; CHECK: Successors:
-; CHECK-NEXT: data SU(5): Latency=4 Reg=%vreg2
-; CHECK-NEXT: ord SU(4): Latency=0
+; CHECK-NEXT: SU(5): Data Latency=4 Reg=%vreg2
+; CHECK-NEXT: SU(4): Ord Latency=0
; CHECK: SU(3): STRWui %WZR, %vreg0, 0; mem:ST4[%ptr1] GPR64common:%vreg0
; CHECK: Successors:
-; CHECK: ord SU(4): Latency=0
+; CHECK: SU(4): Ord Latency=0
; CHECK: SU(4): STRWui %WZR, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
; CHECK: SU(5): %W0<def> = COPY %vreg2; GPR32:%vreg2
; CHECK: ** ScheduleDAGMI::schedule picking next node
diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll
index 2ed744277385..0af0e58a91d4 100644
--- a/test/CodeGen/AArch64/fence-singlethread.ll
+++ b/test/CodeGen/AArch64/fence-singlethread.ll
@@ -16,6 +16,6 @@ define void @fence_singlethread() {
; IOS: ; COMPILER BARRIER
; IOS-NOT: dmb
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll
new file mode 100644
index 000000000000..88e6f5dd01c9
--- /dev/null
+++ b/test/CodeGen/AArch64/preferred-function-alignment.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt81 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt83 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt88 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s
+
+define void @test() {
+ ret void
+}
+
+; CHECK-LABEL: test
+; ALIGN2: .p2align 2
+; ALIGN3: .p2align 3
+; ALIGN4: .p2align 4
diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 4fbd8944f032..7e76dac214a1 100644
--- a/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -37,8 +37,8 @@ declare void @callee2(i8*, i8*, i8*, i8*, i8*,
; CHECK: SU({{.*}}): [[VRB]]<def> = LDRXui <fi#-2>
; CHECK-NOT: SU
; CHECK: Successors:
-; CHECK: ord SU([[DEPSTOREB:.*]]): Latency=0
-; CHECK: ord SU([[DEPSTOREA:.*]]): Latency=0
+; CHECK: SU([[DEPSTOREB:.*]]): Ord Latency=0
+; CHECK: SU([[DEPSTOREA:.*]]): Ord Latency=0
; CHECK: SU([[DEPSTOREA]]): STRXui %vreg{{.*}}, <fi#-4>
; CHECK: SU([[DEPSTOREB]]): STRXui %vreg{{.*}}, <fi#-3>
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index bee13d8c17f1..98848295a73b 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}v_test_add_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -67,7 +67,7 @@ define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -86,7 +86,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 7e4546d2cfb3..6dcd7c234dc6 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -5,9 +5,9 @@
;FUNC-LABEL: {{^}}test1:
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
-;SI-NOT: [[REG]]
-;SI: buffer_store_dword [[REG]],
+;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
+;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
+;SI: buffer_store_dword v[[REG]],
define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
@@ -21,8 +21,8 @@ define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -39,10 +39,10 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index 76f724c2b90b..4baa35ca57c5 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -168,10 +168,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; VI-NOT: shl
-; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll
index 00a125c2e44f..d33965d4dda7 100644
--- a/test/CodeGen/AMDGPU/add_i128.ll
+++ b/test/CodeGen/AMDGPU/add_i128.ll
@@ -19,10 +19,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128
; Check that the SGPR add operand is correctly moved to a VGPR.
; GCN-LABEL: {{^}}sgpr_operand:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %foo, %a
@@ -31,10 +31,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad
}
; GCN-LABEL: {{^}}sgpr_operand_reversed:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %a, %foo
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 62733d5bfb6c..f673d91192b8 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add
; Check that the SGPR add operand is correctly moved to a VGPR.
; SI-LABEL: {{^}}sgpr_operand:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %foo, %a
@@ -32,8 +32,8 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr
; SGPR as other operand.
;
; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %a, %foo
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index b1e71722d80c..a6aa9e795151 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,20 +10,22 @@
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-
-; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
-; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
-; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
@@ -48,6 +50,12 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
@@ -55,12 +63,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
-; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
-; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
-; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
+; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
diff --git a/test/CodeGen/AMDGPU/alignbit-pat.ll b/test/CodeGen/AMDGPU/alignbit-pat.ll
index ff5c8960fad3..3f07188063cd 100644
--- a/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ b/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}alignbit_shr_pat:
; GCN-DAG: s_load_dword s[[SHR:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 0e5605961e10..0c7160df2b96 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
-; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
@@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
@@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
; FIXME: Should be able to get fdiv for 1.0 component
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
; CHECK: store volatile <2 x float> %arcp.25ulp
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
@@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
index 2aec03aff8a3..ef11ae87267e 100644
--- a/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -2,8 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_and_i64_br:
-; SI: v_and_b32
-; SI: v_and_b32
+; SI: s_and_b64
define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index c356f8b87cfc..ee0190149e92 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #0
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,11 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
@@ -136,7 +137,9 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrs
; FUNC-LABEL: {{^}}v_and_constant_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -145,7 +148,9 @@ define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrsp
; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, 64
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -154,7 +159,9 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 a
; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %a, -16
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@@ -239,8 +246,11 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
; SI: v_and_b32
; SI: v_and_b32
define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
+ %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
+ %b = load i64, i64 addrspace(1)* %gep.b, align 8
%and = and i64 %a, %b
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -251,7 +261,9 @@ define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 1231231234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -299,26 +311,30 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -326,13 +342,15 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr
; FIXME: Should be able to reduce load width
; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.a, align 8
%and = and i64 %a, -8
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@@ -549,5 +567,4 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-
attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
index c61c23222bc7..cdc60ab504e0 100644
--- a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
-; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dword
+; GCN: s_load_dwordx4
+; GCN-DAG: s_load_dwordx4
+; GCN-DAG: s_load_dword
; GCN: {{buffer|flat}}_store_byte
; GCN: {{buffer|flat}}_store_byte
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 539373f7bdeb..f29bfb46b94b 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -2,6 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1
@@ -42,12 +44,14 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val)
}
; FUNC-LABEL: {{^}}v_brev_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
ret void
@@ -66,7 +70,9 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI: v_bfrev_b32_e32
; SI: v_bfrev_b32_e32
define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
ret void
@@ -82,7 +88,9 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val)
; FUNC-LABEL: {{^}}v_brev_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
- %val = load i64, i64 addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
ret void
@@ -97,7 +105,9 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FUNC-LABEL: {{^}}v_brev_v2i64:
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
- %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
index d2dacd7c17b3..eb3fc2fab34f 100644
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
; FUNC-LABEL: @test_bswap_i32
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: s_load_dword [[VAL:s[0-9]+]]
; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 5dec3e35ab3d..c114332a5887 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -1,9 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
@@ -40,7 +40,7 @@ done:
; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
; OPT: getelementptr i32, i32 addrspace(4)* %out,
-; OPT-CI-NOT: getelementptr
+; OPT-CI-NOT: getelementptr
; OPT: br i1
; OPT-CI: addrspacecast
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index c1cf56e5058e..c01d834bc33d 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,9 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
index 6ecf75c1acec..90fba0342090 100644
--- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
+++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -1,36 +1,4 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s
---- |
- define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 {
- ret void
- }
-
- define amdgpu_ps void @v_max_reg_imm_f32() #0 {
- ret void
- }
-
- attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
-
-...
---
# GCN-LABEL: name: v_max_self_clamp_not_set_f32
# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
@@ -70,7 +38,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -132,7 +100,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -195,7 +163,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -260,7 +228,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -337,7 +305,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -402,7 +370,7 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
@@ -435,7 +403,7 @@ registers:
- { id: 0, class: vgpr_32 }
- { id: 1, class: vgpr_32 }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %vgpr0
%0 = COPY %vgpr0
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll
index 3e1b76a1df09..14b798ba822b 100644
--- a/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 8
+; CHECK: ; NumVgprs: 4
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index ed78ccc9b617..0401f7b07e21 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -1,84 +1,5 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
---- |
- define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %and = and i32 %a, 1234567
- store volatile i32 %and, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %and = and i32 %a, 1234567
- store i32 %and, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %shl = shl i32 %a, 12
- store volatile i32 %shl, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %shl = shl i32 %a, 12
- store i32 %shl, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %ashr = ashr i32 %a, 12
- store volatile i32 %ashr, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %ashr = ashr i32 %a, 12
- store i32 %ashr, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
- %lshr = lshr i32 %a, 12
- store volatile i32 %lshr, i32 addrspace(1)* %out
- ret void
- }
-
- define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %idxprom = sext i32 %tid to i64
- %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
- %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
- %a = load i32, i32 addrspace(1)* %gep.a
- %lshr = lshr i32 %a, 12
- store i32 %lshr, i32 addrspace(1)* %gep.out
- ret void
- }
-
- define amdgpu_kernel void @undefined_vreg_operand() {
- unreachable
- }
-
- declare i32 @llvm.amdgcn.workitem.id.x() #1
-
- attributes #0 = { nounwind }
- attributes #1 = { nounwind readnone }
-
...
----
# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}}
# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec
@@ -119,11 +40,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %1 = S_LOAD_DWORDX2_IMM %0, 36, 0
%2 = COPY %1.sub1
%3 = COPY %1.sub0
%4 = S_MOV_B32 61440
@@ -133,7 +54,7 @@ body: |
%8 = S_MOV_B32 9999
%9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
%10 = COPY %9
- BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -204,12 +125,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%31 = V_ASHRREV_I32_e64 31, %3, implicit %exec
%32 = REG_SEQUENCE %3, 1, %31, 2
%33 = V_LSHLREV_B64 2, killed %32, implicit %exec
@@ -223,19 +144,19 @@ body: |
%34 = V_MOV_B32_e32 63, implicit %exec
%27 = V_AND_B32_e64 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_AND_B32_e64 %24, %26, implicit %exec
- FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
%29 = V_AND_B32_e32 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr
%30 = V_AND_B32_e64 %26, %26, implicit %exec
- FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr
%31 = V_AND_B32_e64 %34, %34, implicit %exec
- FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -285,11 +206,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 1
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -298,7 +219,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -390,7 +311,7 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
@@ -411,34 +332,34 @@ body: |
%27 = S_MOV_B32 -4
%11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_LSHL_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_LSHL_B32_e64 12, %7, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_LSHL_B32_e64 12, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_LSHL_B32_e64 %6, 12, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_LSHL_B32_e64 %6, 32, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_LSHL_B32_e32 %6, %6, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_LSHL_B32_e32 %27, %6, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -485,11 +406,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -498,7 +419,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -593,12 +514,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
@@ -619,34 +540,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_ASHR_I32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_ASHR_I32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_ASHR_I32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_ASHR_I32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_ASHR_I32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_ASHR_I32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_ASHR_I32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
@@ -693,11 +614,11 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0
%5 = S_MOV_B32 -999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
@@ -706,7 +627,7 @@ body: |
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
- BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -802,12 +723,12 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
@@ -828,34 +749,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
%12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
%13 = V_LSHR_B32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = V_LSHR_B32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
%15 = V_LSHR_B32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
%22 = V_LSHR_B32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
%23 = V_LSHR_B32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
%25 = V_LSHR_B32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
%26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
%28 = V_LSHR_B32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 8611cd080e15..09d4b2c8bd77 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index d772d1b67936..e39bd60a1cc8 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; FUNC-LABEL: {{^}}test_copy_v4i8:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -41,14 +47,16 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -57,7 +65,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
}
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: v_lshrrev_b32
; GCN: v_and_b32
; GCN: v_or_b32
@@ -66,7 +74,9 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -97,19 +107,21 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
- %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
-; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
@@ -120,9 +132,9 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
@@ -135,10 +147,10 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
@@ -148,10 +160,10 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
}
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 149c50685b1d..a544cbe890b5 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
@@ -34,9 +34,9 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
}
; FUNC-LABEL: {{^}}v_ctlz_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
+; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
+; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -44,14 +44,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -62,14 +64,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -90,16 +94,25 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; EG-DAG: FFBH_UINT
; EG-DAG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
+; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
+
+; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
+; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
@@ -136,12 +149,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; FUNC-LABEL: {{^}}v_ctlz_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
@@ -168,12 +181,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -182,12 +197,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -197,13 +214,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
; TODO: Should be able to eliminate select here as well.
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %ctlz, 32
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -212,13 +231,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %ctlz, 32
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -242,7 +263,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; SI: buffer_store_short [[FFBH]],
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 48f3e4401f1a..7500da536307 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -29,21 +29,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -52,14 +54,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -72,18 +76,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
- %val = load i8, i8 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
+ %val = load i8, i8 addrspace(1)* %in.gep
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
ret void
@@ -116,11 +124,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
@@ -145,11 +153,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -158,11 +168,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -186,15 +198,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
; GCN-DAG: buffer_store_dword [[RESULT0]]
; GCN-DAG: buffer_store_byte [[RESULT1]]
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -205,13 +219,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
%sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -221,13 +237,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
%sel = select i1 %cmp, i32 %ctlz, i32 0
@@ -237,13 +255,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
; Compare on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 1
%sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -253,13 +273,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 1
%sel = select i1 %cmp, i32 %ctlz, i32 0
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index aa913ad406d2..68b39bad2bc1 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -8,6 +8,8 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctpop_i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]],
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
@@ -24,22 +26,24 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -49,8 +53,11 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %val1 = load i32, i32 addrspace(1)* %in1, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
+ %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1
@@ -59,15 +66,17 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
- %add = add i32 %ctpop0, %sval
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, %sval
store i32 %add, i32 addrspace(1)* %out, align 4
ret void
}
@@ -80,7 +89,9 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
ret void
@@ -98,7 +109,9 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
ret void
@@ -124,7 +137,9 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
ret void
@@ -166,21 +181,25 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
%ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
ret void
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 4
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -188,14 +207,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 4, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -203,14 +224,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 99999
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -218,7 +241,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -226,7 +249,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, %const
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -234,7 +259,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -242,7 +267,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %const, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
@@ -250,18 +277,22 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
- %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
%const = load i32, i32 addrspace(1)* %gep, align 4
%add = add i32 %const, %ctpop
store i32 %add, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index f18bd9fd8174..4850370851f6 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
@@ -25,14 +27,16 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val)
}
; FUNC-LABEL: {{^}}v_ctpop_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -40,7 +44,7 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
}
; FUNC-LABEL: {{^}}v_ctpop_i64_user:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
@@ -49,7 +53,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%or = or i64 %ctpop, %s.val
store i64 %or, i64 addrspace(1)* %out
@@ -87,7 +93,9 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -105,7 +113,9 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -169,7 +179,8 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; FIXME: Should not have extra add
; FUNC-LABEL: {{^}}v_ctpop_i128:
-; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
@@ -182,7 +193,9 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
- %val = load i128, i128 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
+ %val = load i128, i128 addrspace(1)* %in.gep, align 8
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 1fa6407647eb..1bfd38d94bfd 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -5,6 +5,7 @@
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
@@ -21,21 +22,23 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
-; SI: buffer_load_dwordx2
+; SI: {{buffer|flat}}_load_dwordx2
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx2
@@ -44,14 +47,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
%cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
-; SI: buffer_load_dwordx4
+; SI: {{buffer|flat}}_load_dwordx4
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
@@ -64,7 +69,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
ret void
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0328ce31002d..f839129fc3d8 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-LABEL: {{^}}load_i8_to_f32:
-; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
%cvt = uitofp i8 %load to float
store float %cvt, float addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
-; GCN: buffer_load_ushort [[LD:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
%cvt = uitofp <2 x i8> %load to <2 x float>
store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: v_cvt_f32_ubyte3_e32
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -53,7 +59,9 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -64,10 +72,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
; FIXME: Packing bytes
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32
; GCN-DAG: v_or_b32
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
@@ -77,7 +85,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
@@ -124,14 +134,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
@@ -147,19 +159,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
ret void
}
; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
%add = add i32 %load, 2
%inreg = and i32 %add, 255
%cvt = uitofp i32 %inreg to float
@@ -169,7 +185,9 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
%inreg = and i32 %load, 65280
%shr = lshr i32 %inreg, 8
%cvt = uitofp i32 %shr to float
@@ -181,7 +199,9 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
%ext = zext i8 %load to i32
%cvt = uitofp i32 %ext to float
store float %cvt, float addrspace(1)* %out, align 4
@@ -190,7 +210,9 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
%ext = zext <4 x i8> %load to <4 x i32>
%cvt = uitofp <4 x i32> %ext to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -198,12 +220,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
}
; GCN-LABEL: {{^}}extract_byte0_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%and = and i32 %val, 255
%cvt = uitofp i32 %and to float
store float %cvt, float addrspace(1)* %out
@@ -211,12 +235,14 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte1_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 8
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
@@ -225,12 +251,14 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte2_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 16
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
@@ -239,12 +267,14 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
}
; GCN-LABEL: {{^}}extract_byte3_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
%srl = lshr i32 %val, 24
%and = and i32 %srl, 255
%cvt = uitofp i32 %and to float
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 3148b9b8ff9d..c265b8e2ad2e 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,4 @@
# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
---- |
- define amdgpu_kernel void @test0() { ret void }
- define amdgpu_kernel void @test1() { ret void }
- define amdgpu_kernel void @test2() { ret void }
- define amdgpu_kernel void @test3() { ret void }
- define amdgpu_kernel void @test4() { ret void }
- define amdgpu_kernel void @test5() { ret void }
- define amdgpu_kernel void @loop0() { ret void }
- define amdgpu_kernel void @loop1() { ret void }
- define amdgpu_kernel void @loop2() { ret void }
...
---
# Combined use/def transfer check, the basics.
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 2c474dbe7b08..deb90df99dcf 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -9,7 +9,7 @@
; SI-LABEL: @simple_read2_f32
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
@@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
; SI-LABEL: @simple_read2_f32_max_offset
; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 3dfdaf3936a6..ef4efc6336ce 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -38,9 +38,9 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)*
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
-; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]]
+; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]]
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -64,8 +64,8 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)
; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]]
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 81b35a46aa18..b1fba8c240d7 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -7,7 +7,7 @@
; SI-LABEL: @simple_read2st64_f32_0_1
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
@@ -26,7 +26,7 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0
; SI-LABEL: @simple_read2st64_f32_1_2
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
@@ -46,7 +46,7 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl
; SI-LABEL: @simple_read2st64_f32_max_offset
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index ace01593808b..74404989f8c7 100644
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: Most of these cases don't trigger because of broken cost
; heuristics. Should not need -stress-early-ifcvt
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll
index 9439130deb9e..792f0b1eaef4 100644
--- a/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: This leaves behind a now-unnecessary 'and' with exec
diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
index 6eb1fc1d0cc2..b7dfcd99029a 100644
--- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -2,16 +2,21 @@
; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
; Test that the -enable-no-signed-zeros-fp-math flag works
; GCN-LABEL: {{^}}fneg_fsub_f32:
-; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
; GCN-UNSAFE-NOT: xor
define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
- %a = load float, float addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %tid, 1
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
+ %a = load float, float addrspace(1)* %gep, align 4
%b = load float, float addrspace(1)* %b_ptr, align 4
%result = fsub float %a, %b
%neg.result = fsub float -0.0, %result
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index 34999fa3aea4..3fb452de1ccf 100644
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -1,5 +1,7 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
; Make sure the add and load are reduced to 32-bits even with the
; bitcast to vector.
; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0:
@@ -8,7 +10,9 @@
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
; GCN: buffer_store_dword [[ADD]]
define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
- %a = load i64, i64 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x i32>
%extract = extractelement <2 x i32> %val.bc, i32 0
@@ -21,7 +25,9 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou
; GCN: v_add_f64
; GCN: buffer_store_dword v
define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
- %a = load double, double addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %a = load double, double addrspace(1)* %gep
%add = fadd double %a, %b
%val.bc = bitcast double %add to <2 x i32>
%extract = extractelement <2 x i32> %val.bc, i32 0
@@ -34,7 +40,9 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out
; GCN: v_add_i32
; GCN: buffer_store_dword
define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
- %a = load i64, i64 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep
%add = add i64 %a, %b
%val.bc = bitcast i64 %add to <2 x float>
%extract = extractelement <2 x float> %val.bc, i32 0
diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll
index 4e2ec4b3054f..d56d5ec1411a 100644
--- a/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
; VI: flat_load_ushort [[HI:v[0-9]+]]
; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
+; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
; VI: flat_store_dword
; GFX9: s_load_dword [[VAL:s[0-9]+]]
@@ -62,8 +62,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -80,7 +80,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
-; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
+; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]]
; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
@@ -134,7 +134,9 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%fmul = fmul <2 x half> %fabs, %val
store <2 x half> %fmul, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index 9edf55cbc69f..0c4a77964d15 100644
--- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -16,8 +16,8 @@
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]]
+; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
; GCN: buffer_load_dword [[V:v[0-9]+]]
; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
@@ -75,13 +75,13 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
@@ -108,13 +108,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
; GCN-SLOWFMA: v_mul_f32_e32
; GCN-SLOWFMA: v_mul_f32_e32
@@ -191,17 +191,17 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]]
-; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]]
+; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]
; GCN: buffer_store_dword [[MUL]]
; GCN: buffer_store_dword [[MAD]]
@@ -226,21 +226,21 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN: buffer_load_dword [[U:v[0-9]+]]
; GCN: buffer_load_dword [[V:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
-; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
+; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
-; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
+; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_subrev_f32_e32
+; GCN-SLOWFMA: v_sub_f32_e32
define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index 08199be144f4..88b3be0e0d31 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -2,13 +2,13 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fadd_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fadd_f16(
@@ -24,7 +24,7 @@ entry:
}
; GCN-LABEL: {{^}}fadd_f16_imm_a
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -42,7 +42,7 @@ entry:
}
; GCN-LABEL: {{^}}fadd_f16_imm_b
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -60,8 +60,8 @@ entry:
}
; GCN-LABEL: {{^}}fadd_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -70,16 +70,16 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -88,15 +88,18 @@ define amdgpu_kernel void @fadd_v2f16(
<2 x half> addrspace(1)* %a,
<2 x half> addrspace(1)* %b) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+ %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+ %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
+ %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
%r.val = fadd <2 x half> %a.val, %b.val
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -105,12 +108,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -118,14 +121,16 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
entry:
- %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+ %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
%r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
@@ -134,12 +139,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -147,8 +152,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
entry:
- %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+ %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
%r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index c936d98673ba..8fd1f52006fb 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -5,8 +5,11 @@
; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
- %r0 = load double, double addrspace(1)* %in1
- %r1 = load double, double addrspace(1)* %in2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid
+ %r0 = load double, double addrspace(1)* %gep1
+ %r1 = load double, double addrspace(1)* %gep2
%r2 = fadd double %r0, %r1
store double %r2, double addrspace(1)* %out
ret void
@@ -42,3 +45,8 @@ define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x do
store <2 x double> %r2, <2 x double> addrspace(1)* %out
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
new file mode 100644
index 000000000000..5383bbe71ae3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -0,0 +1,487 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s
+
+; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %v = load float, float addrspace(1)* %gep, align 4
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fmul float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
+; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float 15.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
+; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fadd float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
+; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.sqrt.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
+; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.ceil.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
+; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.floor.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
+; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; GCN: flat_load_dword [[LOAD:v[0-9]+]],
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.canonicalize.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fpext float %load to double
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+ store double %canonicalized, double addrspace(1)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = fpext half %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v = fptrunc double %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fptrunc float %load to half
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+ store half %canonicalized, half addrspace(1)* %gep2, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
+; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
+; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
+; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
+; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
+ %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+ %v = fptrunc <2 x float> %load to <2 x half>
+ %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
+ %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float -0.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
+; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = fsub float -0.0, %v0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
+define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.fabs.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
+; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.fabs.f32(float %v0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
+; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.sin.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
+; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.cos.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
+; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.sin.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
+; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.cos.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
+; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v0 = fadd double %load, 0.0
+ %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ store double %canonicalized, double addrspace(1)* %gep, align 8
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NEXT: ; return
+; GCN-NOT: 1.0
+define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul nnan float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.sqrt.f32(float) #0
+declare float @llvm.ceil.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sin.f32(float) #0
+declare float @llvm.cos.f32(float) #0
+declare half @llvm.sin.f16(half) #0
+declare half @llvm.cos.f16(half) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 404358f0ecb9..dd8e277c1c75 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -5,6 +5,8 @@ declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
@@ -213,7 +215,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
ret void
@@ -233,7 +237,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -251,7 +257,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
%val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
@@ -270,7 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
- %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
%fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
index 8c385f40b1c5..feb4c7bd4a18 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7916226462f7..aef898b1a8ee 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fcmp_f16_lt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -351,23 +351,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lt:
+; SI: v_cmp_lt_f32_e32 vcc,
+; SI: v_cmp_lt_f32_e32 vcc,
+
+; VI: v_cmp_lt_f16_e32 vcc,
+; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -382,22 +371,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_eq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_eq_f32_e32 vcc,
+; SI: v_cmp_eq_f32_e32 vcc,
+
+; VI: v_cmp_eq_f16_e32 vcc,
+; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -411,23 +389,11 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_le
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_le:
+; SI: v_cmp_le_f32_e32 vcc
+; SI: v_cmp_le_f32_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -441,23 +407,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_gt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_gt:
+; SI: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e32 vcc,
+
+; VI: v_cmp_gt_f16_e32 vcc,
+; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -471,23 +426,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_lg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lg:
+; SI: v_cmp_lg_f32_e32 vcc,
+; SI: v_cmp_lg_f32_e32 vcc,
+
+; VI: v_cmp_lg_f16_e32 vcc,
+; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -501,23 +445,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_ge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_ge:
+; SI: v_cmp_ge_f32_e32 vcc,
+; SI: v_cmp_ge_f32_e32 vcc,
+
+; VI: v_cmp_ge_f16_e32 vcc,
+; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -531,23 +464,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_o
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_o:
+; SI: v_cmp_o_f32_e32 vcc,
+; SI: v_cmp_o_f32_e32 vcc,
+
+; VI: v_cmp_o_f16_e32 vcc,
+; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -561,23 +483,12 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}fcmp_v2f16_u
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_u:
+; SI: v_cmp_u_f32_e32 vcc,
+; SI: v_cmp_u_f32_e32 vcc,
+
+; VI: v_cmp_u_f16_e32 vcc,
+; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -592,22 +503,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nge_f32_e32 vcc,
+; SI: v_cmp_nge_f32_e32 vcc,
+
+; VI: v_cmp_nge_f16_e32 vcc,
+; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -622,22 +522,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nlg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nlg_f32_e32 vcc
+; SI: v_cmp_nlg_f32_e32 vcc
+
+; VI: v_cmp_nlg_f16_e32 vcc
+; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -652,22 +541,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_ngt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_ngt_f32_e32 vcc,
+; SI: v_cmp_ngt_f32_e32 vcc,
+
+; VI: v_cmp_ngt_f16_e32 vcc,
+; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -682,22 +560,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_nle
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -712,22 +579,11 @@ entry:
}
; GCN-LABEL: {{^}}fcmp_v2f16_neq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@@ -744,17 +600,19 @@ entry:
; GCN-LABEL: {{^}}fcmp_v2f16_nlt
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
+
+; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
+; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
+
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index b9e1921d4c45..95f7e0be7d9c 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
; CHECK-LABEL: {{^}}flt_f64:
-; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -14,7 +14,7 @@ define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fle_f64:
-; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -26,7 +26,7 @@ define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fgt_f64:
-; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -38,7 +38,7 @@ define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fge_f64:
-; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)*
}
; CHECK-LABEL: {{^}}fne_f64:
-; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
@@ -62,7 +62,7 @@ define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1
}
; CHECK-LABEL: {{^}}feq_f64:
-; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double, double addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
index 125597796245..ca313d80894a 100644
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -6,8 +6,15 @@
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %r1 = load double, double addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
+ %r1 = load double, double addrspace(1)* %gep
%r2 = fadd double %r1, 5.000000e+00
store double %r2, double addrspace(1)* %out
ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 4e2bf765cd95..8e984246cc94 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.copysign.f16(half, half)
declare float @llvm.copysign.f32(float, float)
@@ -9,16 +9,18 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
; GCN-LABEL: {{^}}test_copysign_f16:
-; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
-; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
-; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
@@ -36,8 +38,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
@@ -48,17 +50,20 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
half addrspace(1)* %arg_mag,
float addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
%mag.ext = fpext half %mag to float
- %sign = load float, float addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+ %sign = load float, float addrspace(1)* %arg_sign_gep
%out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
store float %out, float addrspace(1)* %arg_out
ret void
}
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
@@ -70,17 +75,20 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
half addrspace(1)* %arg_mag,
double addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
%mag.ext = fpext half %mag to double
- %sign = load double, double addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+ %sign = load double, double addrspace(1)* %arg_sign_gep
%out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
store double %out, double addrspace(1)* %arg_out
ret void
}
; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
@@ -93,8 +101,11 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
float addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load float, float addrspace(1)* %arg_mag
- %sign = load half, half addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+ %mag = load float, float addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%sign.ext = fpext half %sign to float
%out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
store float %out, float addrspace(1)* %arg_out
@@ -102,8 +113,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
@@ -116,8 +127,11 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
double addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load double, double addrspace(1)* %arg_mag
- %sign = load half, half addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
+ %mag = load double, double addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%sign.ext = fpext half %sign to double
%out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
store double %out, double addrspace(1)* %arg_out
@@ -125,8 +139,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
@@ -141,8 +155,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
half addrspace(1)* %arg_mag,
float addrspace(1)* %arg_sign) {
entry:
- %mag = load half, half addrspace(1)* %arg_mag
- %sign = load float, float addrspace(1)* %arg_sign
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+ %mag = load half, half addrspace(1)* %arg_mag_gep
+ %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+ %sign = load float, float addrspace(1)* %arg_sign_gep
%sign.trunc = fptrunc float %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
store half %out, half addrspace(1)* %arg_out
@@ -150,8 +167,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
@@ -166,8 +183,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
half addrspace(1)* %arg_mag,
double addrspace(1)* %arg_sign) {
entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
%mag = load half, half addrspace(1)* %arg_mag
- %sign = load double, double addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+ %sign = load double, double addrspace(1)* %arg_sign_gep
%sign.trunc = fptrunc double %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
store half %out, half addrspace(1)* %arg_out
@@ -175,8 +195,8 @@ entry:
}
; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
@@ -193,9 +213,12 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
float addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
- %mag = load float, float addrspace(1)* %arg_mag
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+ %mag = load float, float addrspace(1)* %arg_mag_gep
%mag.trunc = fptrunc float %mag to half
- %sign = load half, half addrspace(1)* %arg_sign
+ %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+ %sign = load half, half addrspace(1)* %arg_sign_gep
%out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %out, half addrspace(1)* %arg_out
ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7f84e973c958..333143393cb4 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -27,7 +27,7 @@
; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
-; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
+; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -165,7 +165,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
@@ -187,7 +187,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 738a5adba14f..bc489454341a 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -20,7 +20,7 @@
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -45,7 +45,7 @@ entry:
; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -85,20 +85,11 @@ entry:
}
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
-; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
-; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
-
-; GCN-NOT: s_setreg
-; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
-; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
-; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
; GCN-NOT: s_setreg
-; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
-; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
%fdiv = fdiv fast float %a, %b
@@ -121,6 +112,21 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+ %fdiv = fdiv fast float %a, %b, !fpmath !0
+ store float %fdiv, float addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
@@ -154,8 +160,9 @@ entry:
}
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; GCN: v_cmp_gt_f32
-; GCN: v_cmp_gt_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 4113ba8dc1f0..7526d08bdbe5 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -387,7 +387,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace
; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
@@ -403,7 +403,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
@@ -419,7 +419,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
@@ -435,7 +435,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
@@ -451,7 +451,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
@@ -467,7 +467,7 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
@@ -483,7 +483,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
@@ -499,7 +499,7 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
@@ -515,7 +515,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
@@ -531,7 +531,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
@@ -547,7 +547,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
@@ -563,7 +563,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
@@ -583,8 +583,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
-; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
+; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index 4d3f3712621e..907121f1cd46 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index 659cecb59ebf..6be4c450a51e 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 7643c3ea533c..44c80b63bf7c 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -91,7 +91,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index 27d9261b1fab..4cfc9fc80fb0 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -872,8 +872,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(fl
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
-; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 52336f95a909..0494295fc15f 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -83,7 +83,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -121,7 +121,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index cd86409e2038..5f120f63d7fe 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fmul_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16(
@@ -70,16 +70,16 @@ entry:
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -108,7 +108,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_a(
@@ -134,7 +134,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
index f14233f267b2..d37d432842f3 100644
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FUNC-LABEL: {{^}}fmul_f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 9b713419e747..980d68ceded8 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
@@ -108,7 +108,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
@@ -227,8 +227,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -257,8 +257,8 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -287,7 +287,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -319,7 +319,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -347,13 +347,13 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]]
+; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -385,7 +385,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -416,7 +416,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[R2]], [[R1]], 2.0, -[[R2]]
; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e42255026692..4b1e41ff91e1 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,12 +1,12 @@
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -96,7 +96,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -125,10 +125,10 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -160,10 +160,10 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -192,7 +192,7 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -221,7 +221,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -252,7 +252,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -282,7 +282,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -310,11 +310,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -345,11 +345,11 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -379,10 +379,10 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
@@ -414,10 +414,10 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; SI: buffer_store_dword [[RESULT]]
@@ -446,17 +446,17 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
+; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -489,10 +489,10 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -525,10 +525,10 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -556,10 +556,10 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index 86e91e04b0fc..8d91a56ee421 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
; GCN-LABEL: {{^}}fmuladd_f64:
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index 624610096cbc..b50a26c023ca 100644
--- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 66bf9d0ffb00..002bc47fb96a 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -9,7 +9,7 @@
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
@@ -31,7 +31,7 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
@@ -54,7 +54,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
@@ -82,10 +82,10 @@ define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32
+; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -106,10 +106,10 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +133,7 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float
; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -157,11 +157,11 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -185,10 +185,10 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
@@ -235,7 +235,7 @@ define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[ADD]]
@@ -280,7 +280,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -300,7 +300,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -364,7 +364,7 @@ define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
@@ -974,7 +974,7 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
@@ -1000,7 +1000,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[ADD]]
@@ -1494,7 +1494,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr
; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1556,7 +1556,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspac
; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
@@ -1664,7 +1664,7 @@ define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addr
; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
-; GCN: v_subrev_f32_e32
+; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32
; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc1
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
-; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: buffer_store_dword [[MUL1]]
; GCN: buffer_store_dword [[MUL0]]
@@ -1851,7 +1851,7 @@ define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: buffer_store_dword [[MUL]]
@@ -1984,8 +1984,8 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
; GCN: buffer_store_dword [[MUL1]]
; GCN-NEXT: buffer_store_dword [[MUL2]]
@@ -2084,7 +2084,7 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
-; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
+; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: buffer_store_dword [[FMA0]]
; GCN: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
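Editorial note on the operand swaps above, a sketch assuming standard GCN VOP2 semantics (not part of the patch): v_subrev_f32 evaluates src1 - src0 while v_sub_f32 evaluates src0 - src1, so reversing the captured operands in the CHECK lines keeps the arithmetic identical:

; v_subrev_f32_e32 dst, src0, src1   ; dst = src1 - src0
; v_sub_f32_e32    dst, src0, src1   ; dst = src0 - src1
; hence "v_subrev_f32_e32 R, B, A" and "v_sub_f32_e32 R, A, B" both compute A - B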
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index f4afaca2b7a7..56aea641d16e 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
; CI: v_cvt_f32_f16_e32
; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
-; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
+; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
; GFX89-NOT: _and
; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
@@ -20,7 +20,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x,
; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
; CI-DAG: v_cvt_f32_f16_e32
; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
-; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
+; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
; CI: v_cvt_f16_f32_e32
; GFX89-NOT: _and
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index 0a7346f410c9..3f20ca73e922 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
; SI-NOT: and
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 2d94726cbe20..49d674252746 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -46,7 +46,7 @@ define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
-; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]
; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index 986c6b296c96..3155b7a8664f 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -1,26 +1,5 @@
# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
---- |
- define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 {
- ret void
- }
-
- define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 {
- ret void
- }
-
- attributes #0 = { nounwind }
-
...
----
# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
@@ -62,14 +41,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -133,14 +112,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -204,14 +183,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
@@ -275,14 +254,14 @@ liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+ %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
%27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%28 = REG_SEQUENCE %3, 1, %27, 2
%11 = S_MOV_B32 61440
diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir
index afde89d6b64b..51bb357fcf6e 100644
--- a/test/CodeGen/AMDGPU/fold-operands-order.mir
+++ b/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -1,10 +1,4 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s
-
---- |
- define amdgpu_kernel void @mov_in_use_list_2x() {
- unreachable
- }
-
...
---
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 2c6b1cb18f7e..579a1454dd9a 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 15cc73b9ee53..ec19fd199b4e 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -154,7 +154,7 @@ entry:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
-; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index f310618d8bdb..f593030764a9 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptosi_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index 7641c08e33c3..cebe3304d542 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptoui_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index bc72f4424c98..64df625d4bb5 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
@@ -38,10 +38,10 @@ entry:
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
@@ -68,7 +68,7 @@ entry:
; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 9a56cbe983cd..1314dfe3c7ca 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
declare double @llvm.fabs.f64(double) #0
declare double @llvm.floor.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 207fe280c9a6..2217f67da7d3 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.floor.f32(float) #0
; GCN-LABEL: {{^}}fract_f32:
; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+; GCN-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[INPUT]], [[FLR]]
; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index 9778069d0477..3b8f58cc18a7 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
@@ -29,7 +29,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
-; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[X]], [[INVY]]
; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
; GCN: buffer_store_dword [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 453d8fb37f2f..186757e4c5d8 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index a0fd3411ca05..6bd9a0db14f6 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index fa00c06546db..15a4ce2d88f7 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16(
@@ -70,16 +70,16 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
@@ -109,12 +109,12 @@ entry:
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]
@@ -143,12 +143,12 @@ entry:
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}
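For reference, an editorial sketch inferred from the GFX9 checks above (not part of the patch): GFX9 appears to have no packed f16 subtract, so a <2 x half> fsub is selected as a packed add with the second source negated in both halves:

; a - b  ==>  v_pk_add_f16 dst, a, b neg_lo:[0,1] neg_hi:[0,1]
; With a constant operand the negate moves to the variable source instead,
; as in the neg_lo:[1,0] neg_hi:[1,0] forms checked above.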
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index e7a92d95d485..48647a2cdb89 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_fsub_f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%a = load float, float addrspace(1)* %in, align 4
@@ -41,10 +41,10 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
@@ -67,7 +67,7 @@ define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x flo
}
; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -80,7 +80,7 @@ define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrs
}
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI-NOT: xor
define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -93,7 +93,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float a
}
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI-NOT: xor
define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -109,7 +109,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %ou
; make sure it is disabled and the fneg is not folded if it is not
; "true".
; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
-; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index dc332414a152..73f1a69eeb9d 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare double @llvm.fabs.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 1f72ec65588e..bb2a6ba8e348 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
declare double @llvm.trunc.f64(double) nounwind readnone
declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 19e592f50bea..4e50f995d27e 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
diff --git a/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/test/CodeGen/AMDGPU/global-smrd-unknown.ll
new file mode 100644
index 000000000000..8a576e6480a1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-smrd-unknown.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}unknown_memdep_analysis:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+; GCN: flat_store_dword
+define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg) #0 {
+bb:
+ %tmp53 = load float, float addrspace(1)* undef, align 4
+ %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31
+ %tmp55 = load float, float addrspace(1)* %tmp54, align 4
+ %tmp56 = tail call float @llvm.fmuladd.f32(float undef, float %tmp53, float %tmp55)
+ store float %tmp56, float addrspace(1)* undef, align 4
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
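Editorial sketch of what -amdgpu-scalarize-global-loads toggles (hypothetical kernel, not from the patch): when the option is on, a provably uniform and unclobbered global load is selected as a scalar SMEM load instead of a per-lane VMEM load. That is why the RUN lines throughout this import pin the option to false to keep the existing buffer/flat checks valid, and why the new test above appears to cap memory-dependence scanning so its load stays a flat_load. Roughly:

; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck -check-prefix=SMEM %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-scalarize-global-loads=false -verify-machineinstrs < %s | FileCheck -check-prefix=VMEM %s
; SMEM: s_load_dword s{{[0-9]+}}
; VMEM: {{buffer|flat}}_load_dword
define amdgpu_kernel void @uniform_global_load(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %v = load i32, i32 addrspace(1)* %in
  store i32 %v, i32 addrspace(1)* %out
  ret void
}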
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index 41ae5a4a0b00..43745d4b3da3 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; half args should be promoted to float for SI and lower.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
@@ -471,10 +471,10 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out,
; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
+; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index c2668a077b09..8cda01a10f76 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Use a 64-bit value with lo bits that can be represented as an inline constant
; GCN-LABEL: {{^}}i64_imm_inline_lo:
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index cd3502baee7b..fe86a5872968 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
; FIXME: Merge into imm.ll
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
@@ -305,7 +305,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword
; VI-NOT: and
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 0d20c32a4770..62200b988bea 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index c0f5218efc16..75826d530cb0 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -222,9 +222,9 @@ entry:
; FIXME: Should be scheduled to shrink vcc
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1
; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
+; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc
define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
entry:
%val0 = load volatile i1, i1 addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 5cd965d2fa9c..eea26192ed32 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GatherAllAliases gives up on trying to analyze cases where the
; pointer may have been loaded from an aliased store, so make sure
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index f08d4b6c7915..06dc2cc8b90e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.fabs.f16(half %a)
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 1fcdac537fba..f71b9752e9a1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 2cc63ae74bf1..1b3e09a81e5a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
; FIXME: Enable for VI.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index fe211d356070..7068f4559055 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 593c95856811..871b8c4f99b9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
; GCN: v_bfe_i32
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index 495e36b09f8f..39370e41e8aa 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index e0cec2134e70..8468aa3a7b3e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 92e3a1099da0..68fd08f778c4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
; GCN: v_bfe_u32
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 0604a49372a2..071f2a6de4cd 100644
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.ceil.f16(half %a)
declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index d836ea36ef63..8931de63e74b 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.cos.f16(half %a)
declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
@@ -29,8 +29,8 @@ entry:
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -48,8 +48,8 @@ entry:
; GCN-NOT: and
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @cos_v2f16(
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 5757142b9e95..4e96a7619716 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.exp2.f16(half %a)
declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 6a18141d8035..74d1e694ffbe 100644
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.floor.f16(half %a)
declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 3f4fba7d8ead..a379b18ffb8b 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -128,7 +128,7 @@ define amdgpu_kernel void @fma_f16_imm_c(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
@@ -167,7 +167,7 @@ define amdgpu_kernel void @fma_v2f16(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
@@ -210,7 +210,7 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
@@ -253,7 +253,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 806723e5136c..2d4fe08d8bde 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
-; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -13,11 +13,11 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half>
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
+; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
@@ -110,19 +110,19 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-FLUSH-NOT: v_and_b32
-; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -131,7 +131,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
-; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
+; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index 773eb55283e4..277195c53208 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.log2.f16(half %a)
declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 8f4b314ffabb..c72716439a76 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
@@ -9,9 +9,9 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_f16(
@@ -73,18 +73,18 @@ entry:
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -115,7 +115,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_a(
@@ -143,7 +143,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 1a86286f7136..0e93acc27dc5 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.minnum.f16(half %a, half %b)
declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
@@ -9,9 +9,9 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_f16(
@@ -72,18 +72,18 @@ entry:
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -116,7 +116,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_a(
@@ -144,7 +144,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 30cb969a76e5..92282083984b 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -34,12 +34,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index ffe87977870b..7e29147571f2 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -12,7 +12,7 @@
; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
-; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GCN: buffer_store_dword [[RESULT]]
; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
@@ -70,7 +70,7 @@ define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x floa
; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
-; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GFX89: buffer_store_short [[RESULT]]
define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
%x.arg.trunc = trunc i32 %x.arg to i16
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index eb1f32c981f8..08b9d9d873b4 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
@@ -29,9 +29,9 @@ entry:
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
@@ -47,10 +47,10 @@ entry:
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 46ee6526aca2..0e1358ecca22 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sqrt.f16(half %a)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index dc7182aa0d89..37ee4e92c637 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.trunc.f16(half %a)
declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index bd6fea587b42..77557a584093 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f32:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index 5b772e1fe5ee..84214b7dbc10 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index e3415b9c47de..cb2495d5fdcf 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index 5df32c1e3120..6360d39666c7 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i32:
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index de16b6c8997e..c71db0b7357c 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index fc0cbf916b52..3fe6bd26be14 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index d6162c388b5b..f9ba6241fe06 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i24:
; SI: {{flat|buffer}}_load_ubyte
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 74564f387ede..e1a2af6c7ef9 100644
--- a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -21,6 +22,17 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)*
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
+
+; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
+; WOPT-NOT: call
+; WOPT: br label %load-store-loop
+; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
new file mode 100644
index 000000000000..768acf35eeae
--- /dev/null
+++ b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
@@ -0,0 +1,227 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: cluster_add_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_add_addc
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit def %vcc
+ %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: interleave_add64s
+# GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+# GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+# GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+name: interleave_add64s
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: sreg_64 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: sreg_64 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: sreg_64 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+ %4 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_MOV_B32_e32 0, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+
+ %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+
+
+ %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+ %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_mov_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %2 = S_MOV_B64 0
+# GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
+name: cluster_mov_addc
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = S_MOV_B64 0
+ S_NOP 0, implicit def %vcc
+ %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
+...
+
+# GCN-LABEL: name: no_cluster_add_addc_diff_sgpr
+# GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %8 = S_MOV_B64 0
+# GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+name: no_cluster_add_addc_diff_sgpr
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: sreg_64 }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %8 = S_MOV_B64 0
+ %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit def %vcc
+ %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+...
+# GCN-LABEL: name: cluster_sub_subb
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_sub_subb
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+ %6 = V_MOV_B32_e32 0, implicit %exec
+ %7 = V_MOV_B32_e32 0, implicit %exec
+ S_NOP 0, implicit def %vcc
+ %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_cmp_cndmask
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+name: cluster_cmp_cndmask
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ S_NOP 0, implicit def %vcc
+ %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+
+ %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ S_NOP 0, implicit def %vcc
+ %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+ %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask2
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask2
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 0, implicit %exec
+ %1 = V_MOV_B32_e32 0, implicit %exec
+ %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+ %3 = V_MOV_B32_e32 0, implicit %exec
+ %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index b855fc500c6b..8a6bf853a7c6 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -19,15 +19,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -55,15 +55,15 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
-; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -99,11 +99,11 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -133,8 +133,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -167,9 +167,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -205,8 +205,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -238,9 +238,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -278,7 +278,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -313,8 +313,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -355,9 +355,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -395,13 +395,13 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP1]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -437,13 +437,13 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -479,21 +479,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]]
+; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -530,21 +530,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]
; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 8e0014911def..77c35fac8b5d 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
-; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -199,7 +199,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
bb:
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 6bc40e82459b..b78d65ae1e1a 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -32,8 +32,8 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
-; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
; GCN: s_endpgm
define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index ffcdac03bc74..6387c9ff6dfa 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index dfd5b97fcc86..6b0ec483247c 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index b23b21118aaa..97666492e376 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 57c50c9804e5..a0290789175d 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; mul24 and mad24 are affected
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 82c27f204a47..ba3ff0b08bc9 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -66,9 +66,9 @@
; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index 8a7bf6db5b8d..500e4cb3cc73 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index eb082843fb82..8e6885c4fc5e 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}or_v2i32:
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index f83eb56dc6ed..776b151e3017 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index ecb513cd80b6..d8c7438e4d0d 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
new file mode 100644
index 000000000000..bac348aaed70
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
@@ -0,0 +1,162 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s
+#
+# See bug http://llvm.org/PR33524 for details of the problem being checked here
+# This test will provoke a subrange join (see annotations below) during simple register coalescing
+# Without a fix for PR33524 this causes an unreachable in SubRange Join
+#
+# GCN-DAG: undef %[[REG0:[0-9]+]].sub0 = COPY %sgpr5
+# GCN-DAG: undef %[[REG1:[0-9]+]].sub0 = COPY %sgpr2
+# GCN-DAG: %[[REG0]].sub1 = S_MOV_B32 1
+# GCN-DAG: %[[REG1]].sub1 = S_MOV_B32 1
+
+--- |
+ define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 {
+ ret void
+ }
+
+...
+---
+name: regcoal-subrange-join
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: vreg_128 }
+ - { id: 2, class: vreg_128 }
+ - { id: 3, class: vreg_128 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_32_xm0, preferred-register: '%8' }
+ - { id: 7, class: vreg_128 }
+ - { id: 8, class: sreg_32_xm0, preferred-register: '%6' }
+ - { id: 9, class: vreg_128 }
+ - { id: 10, class: sgpr_32 }
+ - { id: 11, class: sgpr_32 }
+ - { id: 12, class: sgpr_32 }
+ - { id: 13, class: sgpr_32 }
+ - { id: 14, class: sgpr_32 }
+ - { id: 15, class: sgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: sreg_32_xm0 }
+ - { id: 18, class: sreg_64 }
+ - { id: 19, class: sreg_32_xm0 }
+ - { id: 20, class: sreg_32_xm0 }
+ - { id: 21, class: sreg_64 }
+ - { id: 22, class: sreg_32_xm0_xexec }
+ - { id: 23, class: sreg_32_xm0 }
+ - { id: 24, class: sreg_64_xexec }
+ - { id: 25, class: sreg_128 }
+ - { id: 26, class: sreg_64_xexec }
+ - { id: 27, class: sreg_32_xm0_xexec }
+ - { id: 28, class: sreg_32_xm0 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vgpr_32 }
+ - { id: 33, class: vgpr_32 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vgpr_32 }
+ - { id: 38, class: sreg_128 }
+ - { id: 39, class: sreg_64_xexec }
+ - { id: 40, class: sreg_32_xm0_xexec }
+ - { id: 41, class: sreg_32_xm0 }
+ - { id: 42, class: vgpr_32 }
+ - { id: 43, class: vgpr_32 }
+ - { id: 44, class: vgpr_32 }
+ - { id: 45, class: vgpr_32 }
+ - { id: 46, class: vgpr_32 }
+ - { id: 47, class: vgpr_32 }
+ - { id: 48, class: vgpr_32 }
+ - { id: 49, class: vgpr_32 }
+ - { id: 50, class: vgpr_32 }
+ - { id: 51, class: sreg_128 }
+ - { id: 52, class: vgpr_32 }
+ - { id: 53, class: vgpr_32 }
+ - { id: 54, class: vgpr_32 }
+ - { id: 55, class: vgpr_32 }
+ - { id: 56, class: vreg_128 }
+ - { id: 57, class: vreg_128 }
+ - { id: 58, class: vreg_128 }
+ - { id: 59, class: sreg_32_xm0 }
+ - { id: 60, class: sreg_32_xm0 }
+ - { id: 61, class: vreg_128 }
+liveins:
+ - { reg: '%sgpr2', virtual-reg: '%12' }
+ - { reg: '%sgpr5', virtual-reg: '%15' }
+body: |
+ bb.0:
+ liveins: %sgpr2, %sgpr5
+
+ %15 = COPY killed %sgpr5
+ %12 = COPY killed %sgpr2
+ %17 = S_MOV_B32 1
+ undef %18.sub1 = COPY %17
+ %0 = COPY %18
+ %0.sub0 = COPY killed %12
+ %21 = COPY killed %18
+ %21.sub0 = COPY killed %15
+ %22 = S_LOAD_DWORD_IMM killed %21, 2, 0
+ %23 = S_MOV_B32 491436
+ undef %24.sub0 = COPY killed %22
+ %24.sub1 = COPY killed %23
+ %25 = S_LOAD_DWORDX4_IMM killed %24, 0, 0
+ %1 = COPY killed %25
+ %26 = S_LOAD_DWORDX2_IMM %0, 2, 0
+ dead %27 = S_LOAD_DWORD_IMM killed %26, 0, 0
+ S_CBRANCH_SCC0 %bb.1, implicit undef %scc
+
+ bb.5:
+ %58 = COPY killed %1
+ %59 = COPY killed %17
+ S_BRANCH %bb.2
+
+ bb.1:
+ %30 = V_MOV_B32_e32 1036831949, implicit %exec
+ %31 = V_ADD_F32_e32 %30, %1.sub3, implicit %exec
+ %33 = V_ADD_F32_e32 %30, %1.sub2, implicit %exec
+ %35 = V_ADD_F32_e32 %30, %1.sub1, implicit %exec
+ %37 = V_ADD_F32_e32 killed %30, killed %1.sub0, implicit %exec
+ undef %56.sub0 = COPY killed %37
+ %56.sub1 = COPY killed %35
+ %56.sub2 = COPY killed %33
+ %56.sub3 = COPY killed %31
+ %28 = S_MOV_B32 0
+ %2 = COPY killed %56
+ %58 = COPY killed %2
+ %59 = COPY killed %28
+
+ bb.2:
+ %4 = COPY killed %59
+ %3 = COPY killed %58
+ %39 = S_LOAD_DWORDX2_IMM killed %0, 6, 0
+ %40 = S_LOAD_DWORD_IMM killed %39, 0, 0
+ %43 = V_MOV_B32_e32 -1102263091, implicit %exec
+ %60 = COPY killed %4
+ %61 = COPY killed %3
+
+ bb.3:
+ successors: %bb.3, %bb.4
+
+ %7 = COPY killed %61
+ %6 = COPY killed %60
+ %8 = S_ADD_I32 killed %6, 1, implicit-def dead %scc
+ %44 = V_ADD_F32_e32 %43, %7.sub3, implicit %exec
+ %46 = V_ADD_F32_e32 %43, %7.sub2, implicit %exec
+ %48 = V_ADD_F32_e32 %43, %7.sub1, implicit %exec
+ %50 = V_ADD_F32_e32 %43, killed %7.sub0, implicit %exec
+ undef %57.sub0 = COPY killed %50
+ %57.sub1 = COPY killed %48
+ %57.sub2 = COPY %46
+ %57.sub3 = COPY killed %44
+ S_CMP_LT_I32 %8, %40, implicit-def %scc
+ %60 = COPY killed %8
+ %61 = COPY killed %57
+ S_CBRANCH_SCC1 %bb.3, implicit killed %scc
+ S_BRANCH %bb.4
+
+ bb.4:
+ EXP 32, undef %53, undef %54, killed %46, undef %55, 0, 0, 15, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index ff4069226a62..260b32ed3406 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
; SI: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index 266490718dd1..fa29d789cebe 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotl_i64:
; BOTH-DAG: s_lshl_b64
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 9eda479cd25c..af58b404ca6c 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotr_i64:
; BOTH-DAG: s_sub_i32
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index 9462683efe0e..204eeb998386 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.sqrt.f32(float) nounwind readnone
@@ -48,8 +48,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
-; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
-; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]]
+; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-UNSAFE: buffer_store_dword [[RESULT]]
; SI-SAFE-NOT: v_rsq_f32
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index a131aaa3dfb4..797fbc2712b0 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}s_movk_i32_k0:
; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll
index f7a1c65881d0..ee56e9053fd3 100644
--- a/test/CodeGen/AMDGPU/sad.ll
+++ b/test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index 586a455b2b91..09e87d524419 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 6e1dd1638333..d5b2fa0b6754 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 62d0d9367885..0f09fa17423e 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; XXX - Why the packing?
; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]]
; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 44d46086f02a..2dddba8bccc7 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, it
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index 6b1e85915a11..4ae9871865f5 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index abd15f1fb47f..6ed730ad60f4 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -12,10 +12,8 @@
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
-; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
-; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index 7ec6ca809b68..305107f690fb 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by sdiv is long and complex and may frequently change.
; The goal of this test is to make sure the ISel doesn't fail.
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0dc7cc309f7c..0d181c2c34b8 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
@@ -35,7 +35,7 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -68,9 +68,9 @@ entry:
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
@@ -168,14 +168,14 @@ entry:
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_f16_sdwa
; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -362,9 +362,9 @@ entry:
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mac_f16_sdwa
; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -491,7 +491,7 @@ entry:
%tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
+
%arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
ret void
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 3417eb02b361..e0619251f920 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -103,7 +103,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -122,7 +122,7 @@ define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -154,7 +154,7 @@ define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -171,7 +171,7 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -191,7 +191,7 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -245,7 +245,7 @@ define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -266,8 +266,8 @@ define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]]
define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -291,7 +291,7 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
+; GCN-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[Z]], [[SELECT]]
; GCN: buffer_store_dword [[ADD]]
; GCN: buffer_store_dword [[NEG_X]]
@@ -316,8 +316,8 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c)
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]]
define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -341,7 +341,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -359,7 +359,7 @@ define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -377,7 +377,7 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -397,7 +397,7 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -414,7 +414,7 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -431,7 +431,7 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -445,7 +445,7 @@ define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -462,7 +462,7 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -479,7 +479,7 @@ define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -497,7 +497,7 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -517,7 +517,7 @@ define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -540,7 +540,7 @@ define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -563,7 +563,7 @@ define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -585,7 +585,7 @@ define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -606,7 +606,7 @@ define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -628,7 +628,7 @@ define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index ebbc675b2bab..b77ebcf5bf52 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 92ee2eb7f403..e79ce3af0cf9 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -104,8 +104,8 @@ entry:
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -134,8 +134,8 @@ entry:
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -159,16 +159,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
-; VI: v_cmp_lt_f16_e64
; VI: v_cmp_lt_f16_e32
-; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; GCN: s_endpgm
@@ -196,13 +196,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_gt_f32_e64
-; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_gt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
@@ -228,13 +232,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_lt_f32_e64
-; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5
-; VI: v_cmp_gt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_gt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
@@ -263,8 +270,8 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_nlt_f32_e32
-; SI: v_cmp_nlt_f32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; VI: v_cmp_nlt_f16_e32
@@ -298,13 +305,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
+
; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; VI: v_cndmask_b32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index 8d455d84bf9e..bcaa1aa54c15 100644
--- a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -7,7 +7,7 @@
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @multi_use_fneg_src() #0 {
@@ -30,7 +30,7 @@ define amdgpu_kernel void @multi_use_fneg_src() #0 {
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
@@ -78,7 +78,7 @@ define amdgpu_kernel void @multi_use_fneg() #0 {
; GCN: buffer_load_dword [[A:v[0-9]+]]
; GCN: buffer_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
; GCN: buffer_store_dword [[MUL1]]
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index f63719d62a84..a3bf167e756a 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -7,8 +7,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; GCN-DAG: v_cmp_eq_u32_e32
-; GCN-DAG: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%result = icmp eq <2 x i32> %a, %b
%sext = sext <2 x i1> %result to <2 x i32>
@@ -23,9 +23,9 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 160fb6a038fe..5b4d9ed259b6 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
; FIXME: r600 fails verifier
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index fb0bbaa9cbf2..8250bad7b0a1 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
; Copy VGPR -> SGPR used twice as an instruction operand, which is then
; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 931051102cd5..3b24cf82d783 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 4f7b61adc91d..2f9eed457ab6 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Extract the high bit of the 1st quarter
; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
@@ -98,7 +98,7 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128
; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[ELT1PART]], v[[SHLLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index c70eb9b9c4a5..670287ba7937 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 5306e190a4f9..f3faa39c64e6 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index edc313ee323b..13ac9140b827 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 6248d8a46daf..767118eb8d11 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -6,92 +6,7 @@
# that the post-RA run does manage to shrink it, but right now the
# resume crashes
---- |
- define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- declare i32 @llvm.amdgcn.workitem.id.x() #1
-
- attributes #0 = { nounwind }
- attributes #1 = { nounwind readnone }
-
...
----
# GCN-LABEL: name: shrink_add_vop3{{$}}
# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
@@ -151,13 +66,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -166,11 +81,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -235,13 +150,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -250,11 +165,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -319,13 +234,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -334,11 +249,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -402,13 +317,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -417,18 +332,18 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%9 = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_vop3{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit %vcc, implicit %exec
# GCN %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_vop3
@@ -487,13 +402,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -502,19 +417,19 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%vcc = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit undef %vcc, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_undef_vcc
alignment: 0
@@ -572,13 +487,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -587,11 +502,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 348c7200c0bc..17109187d538 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 3e452c214e98..c80945f390be 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 574d1c0b2c78..0bcef99df39f 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}sitofp_i16_to_f16
; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index 827d672022eb..41430715f347 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
@@ -18,7 +18,7 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
; FUNC-LABEL: {{^}}v_abs_i32:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]]
; GCN: v_add_i32
; EG: MAX_INT
@@ -34,7 +34,7 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; GCN-LABEL: {{^}}v_abs_i32_repeat_user:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
@@ -71,8 +71,8 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -132,10 +132,10 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -184,8 +184,8 @@ define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(
; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
+; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
%val0 = load volatile i32, i32 addrspace(1)* %ptr0
%val1 = load volatile i32, i32 addrspace(1)* %ptr1
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index a9aac2d8abb7..27263429650d 100644
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
; GCN-LABEL: {{^}}s_abs_v2i16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll
index 1ca0919258a8..cbf9f37e29ef 100644
--- a/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
; Inline spiller can decide to move a spill as early as possible in the basic block.
; It will skip phis and label, but we also need to make sure it skips instructions
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 44cfdf6398ae..74618b263bad 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
index e06725892089..51eaf9a960b0 100644
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index cb40ecf2de1c..8878b4538555 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
index 135632343f90..d65c2adc7e20 100644
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index 1d407ea9bcda..14bedceed6ee 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}v_test_sub_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -68,7 +68,7 @@ define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -88,7 +88,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index e7655df15520..46f1b120f212 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
index ee923e2b8b61..8d5c8b64efb8 100644
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -6,7 +6,7 @@
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -165,10 +165,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
; VI-NOT: shl
-; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
@@ -201,8 +201,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-; VI-DAG: v_subrev_u16_e32
-; VI-DAG: v_subrev_u16_e32
+; VI: v_sub_u16_e32
+; VI: v_sub_u16_e32
; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
@@ -228,8 +228,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: v_subrev_u16_e32
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
+; VI: v_sub_u16_e32
; VI: buffer_store_dwordx2
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -253,7 +253,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
diff --git a/test/CodeGen/AMDGPU/syncscopes.ll b/test/CodeGen/AMDGPU/syncscopes.ll
new file mode 100644
index 000000000000..3741ce788993
--- /dev/null
+++ b/test/CodeGen/AMDGPU/syncscopes.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-before=si-debugger-insert-nops < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: name: syncscopes
+; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+; GCN: FLAT_STORE_DWORD killed %vgpr7_vgpr8, killed %vgpr6, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+define void @syncscopes(
+ i32 %agent,
+ i32 addrspace(4)* %agent_out,
+ i32 %workgroup,
+ i32 addrspace(4)* %workgroup_out,
+ i32 %wavefront,
+ i32 addrspace(4)* %wavefront_out) {
+entry:
+ store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4
+ store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4
+ store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index f90040385f75..77a6820713d6 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
; CHECK: buffer_load_dword v
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index 0c91d52df0c0..da038f4b0597 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index 632ccaa7e612..5754bd9bb913 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
; GCN: s_add_u32
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_uaddo_i32_novcc:
-; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG: ADDC_UINT
; EG: ADD_INT
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index d9dab0d40acf..1d683776bfd5 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll
index 0c3b0fcaf854..eaa1d073cafb 100644
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}uitofp_i16_to_f16
; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index fb4eab43a2d6..823c918dcda7 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by urem is long and complex and may frequently
; change. The goal of this test is to make sure the ISel doesn't fail
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
index d1f454f0bc65..f01bf498e0d8 100644
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}s_usubo_i64_zext:
; GCN: s_sub_u32
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_usubo_i32_novcc:
-; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
@@ -120,7 +120,7 @@ define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
}
; FUNC-LABEL: {{^}}v_usubo_i16:
-; VI: v_subrev_u16_e32
+; VI: v_sub_u16_e32
; VI: v_cmp_gt_u16_e32
define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index d4a68a418ee4..5cbfae34e1bb 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -200,9 +200,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
-; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
-; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
-; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
+; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
+; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
+; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
@@ -292,10 +292,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_dword
; GCN: load_ubyte
-; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v
+; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; DCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
-; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v
-; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
+; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_byte
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 2b96f7d50076..da57155f33ef 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN: buffer_store_dword [[C]]
define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
@@ -135,7 +135,7 @@ entry:
; GCN-LABEL: {{^}}safe_mad_sub0_src0:
; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
-; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
+; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}}
define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index ce4a69db3506..46c9b7ee1a3d 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
@@ -8,10 +8,10 @@
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]]
+; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
-; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
+; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI: buffer_store_short v[[C_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16(
@@ -147,9 +147,9 @@ entry:
; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
half addrspace(1)* %r,
@@ -171,9 +171,9 @@ entry:
; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
+; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
half addrspace(1)* %r,
@@ -312,20 +312,20 @@ entry:
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
-; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; VI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -481,14 +481,14 @@ entry:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
@@ -513,14 +513,14 @@ entry:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll
index 90cf34e609f6..381ff5b1b518 100644
--- a/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
new file mode 100644
index 000000000000..f8a2339626cf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+
+ define amdgpu_kernel void @fold_fi_vgpr() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_vgpr_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_sgpr_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_fi_sgpr() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_fi_imm() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+ define amdgpu_kernel void @fold_imm_fi() {
+ %alloca = alloca [4 x i32]
+ ret void
+ }
+
+...
+# GCN-LABEL: name: fold_fi_vgpr{{$}}
+# GCN: %1 = IMPLICIT_DEF
+
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_vgpr
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_vgpr_fi{{$}}
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_vgpr_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_sgpr_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_sgpr_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_fi_sgpr{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_fi_sgpr
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = IMPLICIT_DEF
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+...
+# TODO: Should probably prefer folding immediate first
+# GCN-LABEL: name: fold_fi_imm{{$}}
+# GCN: %1 = V_MOV_B32_e32 999, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_imm
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = V_MOV_B32_e32 999, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_imm_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 999, %0, implicit-def %vcc, implicit %exec
+name: fold_imm_fi
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+ di-location: '' }
+body: |
+ bb.0:
+ %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+ %1 = V_MOV_B32_e32 999, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+ S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
new file mode 100644
index 000000000000..b4c0c93347c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
@@ -0,0 +1,40 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+...
+# GCN-LABEL: name: fold_imm_non_ssa{{$}}
+# GCN: %0 = V_MOV_B32_e32 123, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 456, %0, implicit-def %vcc, implicit %exec
+
+name: fold_imm_non_ssa
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64 }
+body: |
+ bb.0:
+ %0 = COPY undef %0
+ %0 = V_MOV_B32_e32 123, implicit %exec
+ %1 = V_MOV_B32_e32 456, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+ S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_partially_defined_superreg{{$}}
+# GCN: %1 = V_MOV_B32_e32 456, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 123, %1, implicit-def %vcc, implicit %exec
+name: fold_partially_defined_superreg
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vreg_64 }
+body: |
+ bb.0:
+ undef %3.sub0 = V_MOV_B32_e32 123, implicit %exec, implicit-def %3
+ %1 = V_MOV_B32_e32 456, implicit %exec
+ %2, %vcc = V_ADD_I32_e64 %3.sub0, %1, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index bb6234729f90..02ffd30be5fd 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -7,7 +7,9 @@
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-; SI: v_cndmask_b32_e64
+; SI: v_cmp_gt_i32_e32 vcc
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_i32_e32 vcc
; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
@@ -25,8 +27,11 @@ entry:
; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e32
+
+; SI: v_cmp_neq_f32_e32 vcc
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_neq_f32_e32 vcc
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
entry:
@@ -45,12 +50,10 @@ entry:
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-; FIXME: The shrinking does not happen on tonga
-
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:
@@ -68,6 +71,10 @@ entry:
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
entry:
%0 = load <4 x float>, <4 x float> addrspace(1)* %in0
diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir
index 44dbd38f2d30..5612c7cac00b 100644
--- a/test/CodeGen/AMDGPU/waitcnt-permute.mir
+++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir
@@ -1,18 +1,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
---- |
- define float @waitcnt-permute(i32 %x, i32 %y) {
- entry:
- %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y)
- %1 = bitcast i32 %0 to float
- %2 = fadd float 1.000000e+00, %1
- ret float %2
- }
-
- declare i32 @llvm.amdgcn.ds.bpermute(i32, i32)
-
...
----
# CHECK-LABEL: name: waitcnt-permute{{$}}
# CHECK: DS_BPERMUTE_B32
# CHECK-NEXT: S_WAITCNT 127
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 57a082a0170c..847a1d739321 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}xor_v2i32:
@@ -60,7 +60,7 @@ define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)*
; FUNC-LABEL: {{^}}v_xor_i1:
; SI: buffer_load_ubyte [[B:v[0-9]+]]
; SI: buffer_load_ubyte [[A:v[0-9]+]]
-; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
+; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index a902234898cd..69c42afb9ad5 100644
--- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -6,7 +6,7 @@
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
@@ -26,7 +26,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs
; GCN-NOT: _or_
; GCN-NOT: v[[HI]]
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]]
; GCN-NOT: v[[HI]]
; GCN-NOT: _or_
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
index 9dcfe5007c00..ed5255bfbebd 100644
--- a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
+++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
@@ -6,23 +6,23 @@
; CHECK: ** List Scheduling
; CHECK: SU(2){{.*}}STR{{.*}}Volatile
-; CHECK-NOT: ord SU
-; CHECK: ord SU(3): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(3): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: SU(3){{.*}}LDR{{.*}}Volatile
-; CHECK-NOT: ord SU
-; CHECK: ord SU(2): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(2): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: Successors:
; CHECK: ** List Scheduling
; CHECK: SU(2){{.*}}STR{{.*}}
-; CHECK-NOT: ord SU
-; CHECK: ord SU(3): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(3): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: SU(3){{.*}}LDR{{.*}}
-; CHECK-NOT: ord SU
-; CHECK: ord SU(2): Latency=1
-; CHECK-NOT: ord SU
+; CHECK-NOT: SU({{.*}}): Ord
+; CHECK: SU(2): Ord Latency=1
+; CHECK-NOT: SU({{.*}}): Ord
; CHECK: Successors:
define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind {
entry:
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
index 111375ece51b..6c8bc7123a1a 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir
@@ -10,6 +10,46 @@
define void @test_icmp_sge_s32() { ret void }
define void @test_icmp_slt_s32() { ret void }
define void @test_icmp_sle_s32() { ret void }
+
+ define void @test_fcmp_true_s32() #0 { ret void }
+ define void @test_fcmp_false_s32() #0 { ret void }
+
+ define void @test_fcmp_oeq_s32() #0 { ret void }
+ define void @test_fcmp_ogt_s32() #0 { ret void }
+ define void @test_fcmp_oge_s32() #0 { ret void }
+ define void @test_fcmp_olt_s32() #0 { ret void }
+ define void @test_fcmp_ole_s32() #0 { ret void }
+ define void @test_fcmp_ord_s32() #0 { ret void }
+ define void @test_fcmp_ugt_s32() #0 { ret void }
+ define void @test_fcmp_uge_s32() #0 { ret void }
+ define void @test_fcmp_ult_s32() #0 { ret void }
+ define void @test_fcmp_ule_s32() #0 { ret void }
+ define void @test_fcmp_une_s32() #0 { ret void }
+ define void @test_fcmp_uno_s32() #0 { ret void }
+
+ define void @test_fcmp_one_s32() #0 { ret void }
+ define void @test_fcmp_ueq_s32() #0 { ret void }
+
+ define void @test_fcmp_true_s64() #0 { ret void }
+ define void @test_fcmp_false_s64() #0 { ret void }
+
+ define void @test_fcmp_oeq_s64() #0 { ret void }
+ define void @test_fcmp_ogt_s64() #0 { ret void }
+ define void @test_fcmp_oge_s64() #0 { ret void }
+ define void @test_fcmp_olt_s64() #0 { ret void }
+ define void @test_fcmp_ole_s64() #0 { ret void }
+ define void @test_fcmp_ord_s64() #0 { ret void }
+ define void @test_fcmp_ugt_s64() #0 { ret void }
+ define void @test_fcmp_uge_s64() #0 { ret void }
+ define void @test_fcmp_ult_s64() #0 { ret void }
+ define void @test_fcmp_ule_s64() #0 { ret void }
+ define void @test_fcmp_une_s64() #0 { ret void }
+ define void @test_fcmp_uno_s64() #0 { ret void }
+
+ define void @test_fcmp_one_s64() #0 { ret void }
+ define void @test_fcmp_ueq_s64() #0 { ret void }
+
+ attributes #0 = { "target-features"="+vfp2" }
...
---
name: test_icmp_eq_s32
@@ -35,8 +75,8 @@ body: |
%2(s1) = G_ICMP intpred(eq), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -71,8 +111,8 @@ body: |
%2(s1) = G_ICMP intpred(ne), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -107,8 +147,8 @@ body: |
%2(s1) = G_ICMP intpred(ugt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -143,8 +183,8 @@ body: |
%2(s1) = G_ICMP intpred(uge), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -179,8 +219,8 @@ body: |
%2(s1) = G_ICMP intpred(ult), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -215,8 +255,8 @@ body: |
%2(s1) = G_ICMP intpred(ule), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -251,8 +291,8 @@ body: |
%2(s1) = G_ICMP intpred(sgt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -287,8 +327,8 @@ body: |
%2(s1) = G_ICMP intpred(sge), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -323,8 +363,8 @@ body: |
%2(s1) = G_ICMP intpred(slt), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
%3(s32) = G_ZEXT %2(s1)
; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
@@ -359,8 +399,1180 @@ body: |
%2(s1) = G_ICMP intpred(sle), %0(s32), %1
; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
- ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
- ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+ ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s32
+# CHECK-LABEL: name: test_fcmp_true_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(true), %0(s32), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s32
+# CHECK-LABEL: name: test_fcmp_false_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(false), %0(s32), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s32
+# CHECK-LABEL: name: test_fcmp_oeq_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s32
+# CHECK-LABEL: name: test_fcmp_ogt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s32
+# CHECK-LABEL: name: test_fcmp_oge_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(oge), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s32
+# CHECK-LABEL: name: test_fcmp_olt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(olt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s32
+# CHECK-LABEL: name: test_fcmp_ole_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ole), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s32
+# CHECK-LABEL: name: test_fcmp_ord_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ord), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s32
+# CHECK-LABEL: name: test_fcmp_ugt_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s32
+# CHECK-LABEL: name: test_fcmp_uge_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(uge), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s32
+# CHECK-LABEL: name: test_fcmp_ult_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ult), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s32
+# CHECK-LABEL: name: test_fcmp_ule_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ule), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s32
+# CHECK-LABEL: name: test_fcmp_une_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(une), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s32
+# CHECK-LABEL: name: test_fcmp_uno_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(uno), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s32
+# CHECK-LABEL: name: test_fcmp_ueq_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+ %1(s32) = COPY %s1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+ %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s64
+# CHECK-LABEL: name: test_fcmp_true_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(true), %0(s64), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s64
+# CHECK-LABEL: name: test_fcmp_false_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(false), %0(s64), %1
+ ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s64
+# CHECK-LABEL: name: test_fcmp_oeq_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(oeq), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s64
+# CHECK-LABEL: name: test_fcmp_ogt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ogt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s64
+# CHECK-LABEL: name: test_fcmp_oge_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(oge), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s64
+# CHECK-LABEL: name: test_fcmp_olt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(olt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s64
+# CHECK-LABEL: name: test_fcmp_ole_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ole), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s64
+# CHECK-LABEL: name: test_fcmp_ord_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ord), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s64
+# CHECK-LABEL: name: test_fcmp_uge_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(uge), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s64
+# CHECK-LABEL: name: test_fcmp_ult_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ult), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s64
+# CHECK-LABEL: name: test_fcmp_ule_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ule), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s64
+# CHECK-LABEL: name: test_fcmp_une_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(une), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s64
+# CHECK-LABEL: name: test_fcmp_uno_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(uno), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s64
+# CHECK-LABEL: name: test_fcmp_one_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(one), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr
+
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[RET]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s64
+# CHECK-LABEL: name: test_fcmp_ueq_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: fprb }
+ - { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+ %1(s64) = COPY %d1
+ ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+ %2(s1) = G_FCMP floatpred(ueq), %0(s64), %1
+ ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr
+ ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
+ ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
+ ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr
+
+    %3(s32) = G_ZEXT %2(s1)
+    ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _
+
+    %r0 = COPY %3(s32)
+    ; CHECK: %r0 = COPY [[RET]]
+
+    BX_RET 14, _, implicit %r0
+    ; CHECK: BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
index 7d021fdb43dd..98b39e444ac7 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
@@ -49,3 +49,33 @@ define arm_aapcscc double @test_add_double(double %x, double %y) {
%r = fadd double %x, %y
ret double %r
}
+
+define arm_aapcs_vfpcc i32 @test_cmp_float_ogt(float %x, float %y) {
+; CHECK-LABEL: test_cmp_float_ogt
+; HARD: vcmp.f32
+; HARD: vmrs APSR_nzcv, fpscr
+; HARD-NEXT: movgt
+; SOFT-AEABI: blx __aeabi_fcmpgt
+; SOFT-DEFAULT: blx __gtsf2
+entry:
+ %v = fcmp ogt float %x, %y
+ %r = zext i1 %v to i32
+ ret i32 %r
+}
+
+define arm_aapcs_vfpcc i32 @test_cmp_float_one(float %x, float %y) {
+; CHECK-LABEL: test_cmp_float_one
+; HARD: vcmp.f32
+; HARD: vmrs APSR_nzcv, fpscr
+; HARD: movgt
+; HARD-NOT: vcmp
+; HARD: movmi
+; SOFT-AEABI-DAG: blx __aeabi_fcmpgt
+; SOFT-AEABI-DAG: blx __aeabi_fcmplt
+; SOFT-DEFAULT-DAG: blx __gtsf2
+; SOFT-DEFAULT-DAG: blx __ltsf2
+entry:
+ %v = fcmp one float %x, %y
+ %r = zext i1 %v to i32
+ ret i32 %r
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
index c93e7fa0ec56..9a0877846fc3 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
@@ -36,6 +36,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -44,6 +45,7 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
%2(s32) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -70,6 +72,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -78,6 +81,7 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
%2(s32) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -106,6 +110,7 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -114,7 +119,9 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_SDIV
%2(s16) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s16)
@@ -143,6 +150,7 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -151,7 +159,9 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_UDIV
%2(s16) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s16)
@@ -180,6 +190,7 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_SDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -188,7 +199,9 @@ body: |
; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SDIV
; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_SDIV
%2(s8) = G_SDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s8)
@@ -217,6 +230,7 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]]
+ ; SOFT-NOT: G_UDIV
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X32]]
; SOFT-DAG: %r1 = COPY [[Y32]]
@@ -225,7 +239,9 @@ body: |
; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UDIV
; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]]
+ ; SOFT-NOT: G_UDIV
%2(s8) = G_UDIV %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s8)
@@ -254,6 +270,7 @@ body: |
; HWDIV: [[Q:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT-NOT: G_SREM
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -262,6 +279,7 @@ body: |
; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_SREM
%2(s32) = G_SREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -290,6 +308,7 @@ body: |
; HWDIV: [[Q:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT-NOT: G_UREM
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -298,6 +317,7 @@ body: |
; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_UREM
%2(s32) = G_UREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
index 803135ba595e..cb61f95b10ce 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
@@ -10,6 +10,44 @@
define void @test_fadd_float() { ret void }
define void @test_fadd_double() { ret void }
+
+ define void @test_fcmp_true_s32() { ret void }
+ define void @test_fcmp_false_s32() { ret void }
+
+ define void @test_fcmp_oeq_s32() { ret void }
+ define void @test_fcmp_ogt_s32() { ret void }
+ define void @test_fcmp_oge_s32() { ret void }
+ define void @test_fcmp_olt_s32() { ret void }
+ define void @test_fcmp_ole_s32() { ret void }
+ define void @test_fcmp_ord_s32() { ret void }
+ define void @test_fcmp_ugt_s32() { ret void }
+ define void @test_fcmp_uge_s32() { ret void }
+ define void @test_fcmp_ult_s32() { ret void }
+ define void @test_fcmp_ule_s32() { ret void }
+ define void @test_fcmp_une_s32() { ret void }
+ define void @test_fcmp_uno_s32() { ret void }
+
+ define void @test_fcmp_one_s32() { ret void }
+ define void @test_fcmp_ueq_s32() { ret void }
+
+ define void @test_fcmp_true_s64() { ret void }
+ define void @test_fcmp_false_s64() { ret void }
+
+ define void @test_fcmp_oeq_s64() { ret void }
+ define void @test_fcmp_ogt_s64() { ret void }
+ define void @test_fcmp_oge_s64() { ret void }
+ define void @test_fcmp_olt_s64() { ret void }
+ define void @test_fcmp_ole_s64() { ret void }
+ define void @test_fcmp_ord_s64() { ret void }
+ define void @test_fcmp_ugt_s64() { ret void }
+ define void @test_fcmp_uge_s64() { ret void }
+ define void @test_fcmp_ult_s64() { ret void }
+ define void @test_fcmp_ule_s64() { ret void }
+ define void @test_fcmp_une_s64() { ret void }
+ define void @test_fcmp_uno_s64() { ret void }
+
+ define void @test_fcmp_one_s64() { ret void }
+ define void @test_fcmp_ueq_s64() { ret void }
...
---
name: test_frem_float
@@ -31,6 +69,7 @@ body: |
; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
%0(s32) = COPY %r0
%1(s32) = COPY %r1
+ ; CHECK-NOT: G_FREM
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -41,6 +80,7 @@ body: |
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; HARD: [[R:%[0-9]+]](s32) = COPY %s0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FREM
%2(s32) = G_FREM %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -86,6 +126,7 @@ body: |
; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]]
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ ; CHECK-NOT: G_FREM
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -96,6 +137,7 @@ body: |
; SOFT: BLX $fmod, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; HARD: BLX $fmod, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FREM
%6(s64) = G_FREM %4, %5
%7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64)
%r0 = COPY %7(s32)
@@ -122,6 +164,7 @@ body: |
; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
%0(s32) = COPY %r0
%1(s32) = COPY %r1
+ ; CHECK-NOT: G_FPOW
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -132,6 +175,7 @@ body: |
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; HARD: [[R:%[0-9]+]](s32) = COPY %s0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FPOW
%2(s32) = G_FPOW %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -177,6 +221,7 @@ body: |
; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]]
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ ; CHECK-NOT: G_FPOW
; CHECK: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -187,6 +232,7 @@ body: |
; SOFT: BLX $pow, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; HARD: BLX $pow, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
; CHECK: ADJCALLSTACKUP
+ ; CHECK-NOT: G_FPOW
%6(s64) = G_FPOW %4, %5
%7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64)
%r0 = COPY %7(s32)
@@ -214,6 +260,7 @@ body: |
%0(s32) = COPY %r0
%1(s32) = COPY %r1
; HARD: [[R:%[0-9]+]](s32) = G_FADD [[X]], [[Y]]
+ ; SOFT-NOT: G_FADD
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r0 = COPY [[X]]
; SOFT-DAG: %r1 = COPY [[Y]]
@@ -221,6 +268,7 @@ body: |
; SOFT-DEFAULT: BLX $__addsf3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_FADD
%2(s32) = G_FADD %0, %1
; CHECK: %r0 = COPY [[R]]
%r0 = COPY %2(s32)
@@ -261,6 +309,7 @@ body: |
%4(s64) = G_MERGE_VALUES %0(s32), %1(s32)
%5(s64) = G_MERGE_VALUES %2(s32), %3(s32)
; HARD: [[R:%[0-9]+]](s64) = G_FADD [[X]], [[Y]]
+ ; SOFT-NOT: G_FADD
; SOFT: ADJCALLSTACKDOWN
; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
@@ -269,6 +318,7 @@ body: |
; SOFT-AEABI: BLX $__aeabi_dadd, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; SOFT-DEFAULT: BLX $__adddf3, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; SOFT: ADJCALLSTACKUP
+ ; SOFT-NOT: G_FADD
%6(s64) = G_FADD %4, %5
; HARD-DAG: G_UNMERGE_VALUES [[R]](s64)
%7(s32),%8(s32) = G_UNMERGE_VALUES %6(s64)
@@ -276,3 +326,1565 @@ body: |
%r1 = COPY %8(s32)
BX_RET 14, _, implicit %r0, implicit %r1
...
+---
+name: test_fcmp_true_s32
+# CHECK-LABEL: name: test_fcmp_true_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(true), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s32
+# CHECK-LABEL: name: test_fcmp_false_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(false), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s32
+# CHECK-LABEL: name: test_fcmp_oeq_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s32
+# CHECK-LABEL: name: test_fcmp_ogt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s32
+# CHECK-LABEL: name: test_fcmp_oge_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(oge), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s32
+# CHECK-LABEL: name: test_fcmp_olt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(olt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s32
+# CHECK-LABEL: name: test_fcmp_ole_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ole), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s32
+# CHECK-LABEL: name: test_fcmp_ord_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ord), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s32
+# CHECK-LABEL: name: test_fcmp_ugt_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s32
+# CHECK-LABEL: name: test_fcmp_uge_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(uge), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s32
+# CHECK-LABEL: name: test_fcmp_ult_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ult), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s32
+# CHECK-LABEL: name: test_fcmp_ule_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ule), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s32
+# CHECK-LABEL: name: test_fcmp_une_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(une), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__nesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s32
+# CHECK-LABEL: name: test_fcmp_uno_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(uno), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]]
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s32
+# CHECK-LABEL: name: test_fcmp_ueq_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s32), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]]
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %3(s32) = G_ZEXT %2(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %3(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_true_s64
+# CHECK-LABEL: name: test_fcmp_true_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(true), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_false_s64
+# CHECK-LABEL: name: test_fcmp_false_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(false), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32)
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oeq_s64
+# CHECK-LABEL: name: test_fcmp_oeq_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(oeq), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ogt_s64
+# CHECK-LABEL: name: test_fcmp_ogt_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ogt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_oge_s64
+# CHECK-LABEL: name: test_fcmp_oge_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(oge), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_olt_s64
+# CHECK-LABEL: name: test_fcmp_olt_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(olt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ole_s64
+# CHECK-LABEL: name: test_fcmp_ole_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ole), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ord_s64
+# CHECK-LABEL: name: test_fcmp_ord_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ord), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ugt), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uge_s64
+# CHECK-LABEL: name: test_fcmp_uge_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(uge), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ult_s64
+# CHECK-LABEL: name: test_fcmp_ult_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ult), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ule_s64
+# CHECK-LABEL: name: test_fcmp_ule_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ule), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_une_s64
+# CHECK-LABEL: name: test_fcmp_une_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(une), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__nedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]]
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_uno_s64
+# CHECK-LABEL: name: test_fcmp_uno_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(uno), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_one_s64
+# CHECK-LABEL: name: test_fcmp_one_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(one), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_fcmp_ueq_s64
+# CHECK-LABEL: name: test_fcmp_ueq_s64
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
+ - { id: 6, class: _ }
+ - { id: 7, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1, %r2, %r3
+
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ %2(s32) = COPY %r2
+ %3(s32) = COPY %r3
+ ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+ ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+ ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+ %4(s64) = G_MERGE_VALUES %0(s32), %1
+ %5(s64) = G_MERGE_VALUES %2(s32), %3
+ ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+ ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32)
+ %6(s1) = G_FCMP floatpred(ueq), %4(s64), %5
+ ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s64), [[Y]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]]
+ ; SOFT-NOT: G_FCMP
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X0]]
+ ; SOFT-DAG: %r1 = COPY [[X1]]
+ ; SOFT-DAG: %r2 = COPY [[Y0]]
+ ; SOFT-DAG: %r3 = COPY [[Y1]]
+ ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+ ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32)
+ ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+ ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]]
+ ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]]
+ ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]]
+ ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]]
+ ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]]
+ ; SOFT-NOT: G_FCMP
+ %7(s32) = G_ZEXT %6(s1)
+ ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1)
+ %r0 = COPY %7(s32)
+ ; CHECK: %r0 = COPY [[REXT]]
+ BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index bf759728c365..4575341dfc29 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -111,6 +111,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_ADD %0, %1
; G_ADD with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -136,6 +137,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_ADD %0, %1
; G_ADD with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -187,6 +189,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_SUB %0, %1
; G_SUB with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -212,6 +215,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_SUB %0, %1
; G_SUB with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -263,6 +267,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_MUL %0, %1
; G_MUL with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -288,6 +293,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_MUL %0, %1
; G_MUL with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -339,6 +345,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_AND %0, %1
; G_AND with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -364,6 +371,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_AND %0, %1
; G_AND with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -415,6 +423,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_OR %0, %1
; G_OR with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -440,6 +449,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_OR %0, %1
; G_OR with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -491,6 +501,7 @@ body: |
%1(s8) = COPY %r1
%2(s8) = G_XOR %0, %1
; G_XOR with s8 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
@@ -516,6 +527,7 @@ body: |
%1(s16) = COPY %r1
%2(s16) = G_XOR %0, %1
; G_XOR with s16 should widen
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}}
; CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
@@ -689,11 +701,32 @@ selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
body: |
bb.0:
%0(s32) = G_CONSTANT 42
; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 42
+ %1(s16) = G_CONSTANT i16 21
+ ; CHECK-NOT: G_CONSTANT i16
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 21
+ ; CHECK: {{%[0-9]+}}(s16) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i16
+
+ %2(s8) = G_CONSTANT i8 10
+ ; CHECK-NOT: G_CONSTANT i8
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 10
+ ; CHECK: {{%[0-9]+}}(s8) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i8
+
+ %3(s1) = G_CONSTANT i1 1
+ ; CHECK-NOT: G_CONSTANT i1
+ ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 -1
+ ; CHECK: {{%[0-9]+}}(s1) = G_TRUNC [[EXT]](s32)
+ ; CHECK-NOT: G_CONSTANT i1
+
%r0 = COPY %0(s32)
BX_RET 14, _, implicit %r0
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index d3b93e488ef4..ffca431d96ea 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -35,6 +35,8 @@
define void @test_trunc_s32_16() { ret void }
define void @test_icmp_eq_s32() { ret void }
+ define void @test_fcmp_one_s32() #0 { ret void }
+ define void @test_fcmp_ugt_s64() #0 { ret void }
define void @test_select_s32() { ret void }
@@ -743,6 +745,62 @@ body: |
...
---
+name: test_fcmp_one_s32
+# CHECK-LABEL: name: test_fcmp_one_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: fprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %s0, %s1
+
+ %0(s32) = COPY %s0
+ %1(s32) = COPY %s1
+ %2(s1) = G_FCMP floatpred(one), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %r0 = COPY %3(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
+name: test_fcmp_ugt_s64
+# CHECK-LABEL: name: test_fcmp_ugt_s64
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: fprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+body: |
+ bb.0:
+ liveins: %d0, %d1
+
+ %0(s64) = COPY %d0
+ %1(s64) = COPY %d1
+ %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %r0 = COPY %3(s32)
+ BX_RET 14, _, implicit %r0
+
+...
+---
name: test_select_s32
# CHECK-LABEL: name: test_select_s32
legalized: true
diff --git a/test/CodeGen/ARM/arguments-nosplit-double.ll b/test/CodeGen/ARM/arguments-nosplit-double.ll
index 8e4dee45ddf2..bb3710842d34 100644
--- a/test/CodeGen/ARM/arguments-nosplit-double.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-double.ll
@@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, double %b) {
ret i32 %tmp
}
+; CHECK-LABEL: f:
; CHECK-NOT: r3
diff --git a/test/CodeGen/ARM/arguments-nosplit-i64.ll b/test/CodeGen/ARM/arguments-nosplit-i64.ll
index 4a08d0a0406a..02bdc6cc227a 100644
--- a/test/CodeGen/ARM/arguments-nosplit-i64.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-i64.ll
@@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, i64 %b) {
ret i32 %tmp
}
+; CHECK-LABEL: f:
; CHECK-NOT: r3
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
index d54848a6bcf1..0ae2d5f6f2f2 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
@@ -13,13 +13,13 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=4
define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
%1 = load i32, i32* @a, align 4
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
index 9cb076651f5b..bc7a14b1028e 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
@@ -8,9 +8,9 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 3
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=3
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=3
define i32 @foo(i32* %a) nounwind optsize {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
index 774b0a907e39..67cddc14d047 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
@@ -10,7 +10,7 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2
; CHECK: Successors
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index e234e179ed07..372b2e2f5dc9 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -11,7 +11,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMULS read-advanced latency to VMLAS = 0
; CHECK-SAME: Latency=0
@@ -20,7 +20,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMLAS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS read-advanced latency to the next VMLAS = 4
; CHECK-SAME: Latency=4
@@ -28,7 +28,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; CHECK-FAST: VFMAS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -50,7 +50,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; VMULfd read-advanced latency to VMLAfd = 0
; CHECK-SAME: Latency=0
@@ -59,7 +59,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMLAfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAfd read-advanced latency to the next VMLAfd = 4
; CHECK-SAME: Latency=4
@@ -67,7 +67,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; CHECK-FAST: VFMAfd
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9
@@ -88,7 +88,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMULS read-advanced latency to VMLSS = 0
; CHECK-SAME: Latency=0
@@ -97,7 +97,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMLSS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSS read-advanced latency to the next VMLSS = 4
; CHECK-SAME: Latency=4
@@ -105,7 +105,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float
; CHECK-FAST: VFMSS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -127,7 +127,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; VMULfd read-advanced latency to VMLSfd = 0
; CHECK-SAME: Latency=0
@@ -136,7 +136,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMLSfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSfd read-advanced latency to the next VMLSfd = 4
; CHECK-SAME: Latency=4
@@ -144,7 +144,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; CHECK-FAST: VFMSfd
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLSfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9
@@ -165,7 +165,7 @@ define float @Test5(float %f1, float %f2, float %f3) {
; CHECK-FAST: VFNMS
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
@@ -184,7 +184,7 @@ define float @Test6(float %f1, float %f2, float %f3) {
; CHECK-FAST: VFNMA
; CHECK: Latency : 9
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; > VMLAS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
index 6cfa823fb969..b5edcc304229 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@@ -13,15 +13,15 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=1
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=6
define i32 @bar(i32* %iptr) minsize optsize {
%1 = load double, double* @a, align 8
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
index 218b5b41a7e4..12c7b3270c3b 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
@@ -8,11 +8,11 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=5
-; CHECK-NEXT: data
+; CHECK-NEXT: Data
; CHECK-SAME: Latency=6
define double @foo(double* %a) nounwind optsize {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
index af1c469d4443..05c498eee49f 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
@@ -9,7 +9,7 @@
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:
-; CHECK: data
+; CHECK: Data
; CHECK-SAME: Latency=1
@a = global double 0.0, align 4
diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll
index ec032ccac423..536b6cc7c9d0 100644
--- a/test/CodeGen/ARM/fence-singlethread.ll
+++ b/test/CodeGen/ARM/fence-singlethread.ll
@@ -11,6 +11,6 @@ define void @fence_singlethread() {
; CHECK: @ COMPILER BARRIER
; CHECK-NOT: dmb
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
diff --git a/test/CodeGen/ARM/ror.ll b/test/CodeGen/ARM/ror.ll
new file mode 100644
index 000000000000..0f699a8dd29d
--- /dev/null
+++ b/test/CodeGen/ARM/ror.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
+
+; rotr (rotr x, 4), 6 -> rotr x, 10 -> ror r0, r0, #10
+define i32 @test1(i32 %x) nounwind readnone {
+; CHECK-LABEL: test1:
+; CHECK: ror r0, r0, #10
+; CHECK: bx lr
+entry:
+ %high_part.i = shl i32 %x, 28
+ %low_part.i = lshr i32 %x, 4
+ %result.i = or i32 %high_part.i, %low_part.i
+ %high_part.i.1 = shl i32 %result.i, 26
+ %low_part.i.2 = lshr i32 %result.i, 6
+ %result.i.3 = or i32 %low_part.i.2, %high_part.i.1
+ ret i32 %result.i.3
+}
+
+; the same vector test
+define <2 x i32> @test2(<2 x i32> %x) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK: ror r0, r0, #10
+; CHECK: ror r1, r1, #10
+; CHECK: bx lr
+entry:
+ %high_part.i = shl <2 x i32> %x, <i32 28, i32 28>
+ %low_part.i = lshr <2 x i32> %x, <i32 4, i32 4>
+ %result.i = or <2 x i32> %high_part.i, %low_part.i
+ %high_part.i.1 = shl <2 x i32> %result.i, <i32 26, i32 26>
+ %low_part.i.2 = lshr <2 x i32> %result.i, <i32 6, i32 6>
+ %result.i.3 = or <2 x i32> %low_part.i.2, %high_part.i.1
+ ret <2 x i32> %result.i.3
+}
+
diff --git a/test/CodeGen/ARM/scavenging.mir b/test/CodeGen/ARM/scavenging.mir
new file mode 100644
index 000000000000..09040a3bd217
--- /dev/null
+++ b/test/CodeGen/ARM/scavenging.mir
@@ -0,0 +1,66 @@
+# RUN: llc -o - %s -mtriple=arm-arm-none-eabi -mcpu=cortex-m0 -run-pass scavenger-test | FileCheck %s
+---
+# CHECK-LABEL: name: scavengebug0
+# Make sure we are not spilling/using a physreg used in the very last
+# instruction of the scavenging range.
+# CHECK-NOT: tSTRi {{.*}}%r0,{{.*}}%r0
+# CHECK-NOT: tSTRi {{.*}}%r1,{{.*}}%r1
+# CHECK-NOT: tSTRi {{.*}}%r2,{{.*}}%r2
+# CHECK-NOT: tSTRi {{.*}}%r3,{{.*}}%r3
+# CHECK-NOT: tSTRi {{.*}}%r4,{{.*}}%r4
+# CHECK-NOT: tSTRi {{.*}}%r5,{{.*}}%r5
+# CHECK-NOT: tSTRi {{.*}}%r6,{{.*}}%r6
+# CHECK-NOT: tSTRi {{.*}}%r7,{{.*}}%r7
+name: scavengebug0
+body: |
+ bb.0:
+ ; Bring up register pressure to force emergency spilling
+ %r0 = IMPLICIT_DEF
+ %r1 = IMPLICIT_DEF
+ %r2 = IMPLICIT_DEF
+ %r3 = IMPLICIT_DEF
+ %r4 = IMPLICIT_DEF
+ %r5 = IMPLICIT_DEF
+ %r6 = IMPLICIT_DEF
+ %r7 = IMPLICIT_DEF
+
+ %0 : tgpr = IMPLICIT_DEF
+ %0 = tADDhirr %0, %sp, 14, _
+ tSTRi %r0, %0, 0, 14, _
+
+ %1 : tgpr = IMPLICIT_DEF
+ %1 = tADDhirr %1, %sp, 14, _
+ tSTRi %r1, %1, 0, 14, _
+
+ %2 : tgpr = IMPLICIT_DEF
+ %2 = tADDhirr %2, %sp, 14, _
+ tSTRi %r2, %2, 0, 14, _
+
+ %3 : tgpr = IMPLICIT_DEF
+ %3 = tADDhirr %3, %sp, 14, _
+ tSTRi %r3, %3, 0, 14, _
+
+ %4 : tgpr = IMPLICIT_DEF
+ %4 = tADDhirr %4, %sp, 14, _
+ tSTRi %r4, %4, 0, 14, _
+
+ %5 : tgpr = IMPLICIT_DEF
+ %5 = tADDhirr %5, %sp, 14, _
+ tSTRi %r5, %5, 0, 14, _
+
+ %6 : tgpr = IMPLICIT_DEF
+ %6 = tADDhirr %6, %sp, 14, _
+ tSTRi %r6, %6, 0, 14, _
+
+ %7 : tgpr = IMPLICIT_DEF
+ %7 = tADDhirr %7, %sp, 14, _
+ tSTRi %r7, %7, 0, 14, _
+
+ KILL %r0
+ KILL %r1
+ KILL %r2
+ KILL %r3
+ KILL %r4
+ KILL %r5
+ KILL %r6
+ KILL %r7
diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll
new file mode 100644
index 000000000000..d6f07f653576
--- /dev/null
+++ b/test/CodeGen/AVR/branch-relaxation.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
+; LBB0_1:
+
+define i8 @relax_breq(i1 %a) {
+entry-block:
+ br i1 %a, label %hello, label %finished
+
+hello:
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ br label %finished
+finished:
+ ret i8 3
+}
+
+; CHECK-LABEL: no_relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
+; CHECK: nop
+; ...
+; LBB0_1:
+define i8 @no_relax_breq(i1 %a) {
+entry-block:
+ br i1 %a, label %hello, label %finished
+
+hello:
+ ; There are not enough NOPs to require relaxation.
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "nop", ""()
+ br label %finished
+finished:
+ ret i8 3
+}
+
diff --git a/test/CodeGen/AVR/ctlz.ll b/test/CodeGen/AVR/ctlz.ll
index 4f73e846b1f1..8659550baf90 100644
--- a/test/CodeGen/AVR/ctlz.ll
+++ b/test/CodeGen/AVR/ctlz.ll
@@ -10,7 +10,8 @@ declare i8 @llvm.ctlz.i8(i8)
; CHECK-LABEL: count_leading_zeros:
; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
; CHECK: lsr {{.*}}[[SCRATCH]]
; CHECK: or {{.*}}[[SCRATCH]], {{.*}}[[RESULT]]
@@ -43,6 +44,6 @@ declare i8 @llvm.ctlz.i8(i8)
; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: andi {{.*}}[[RESULT]], 15
; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: LBB0_2:
; CHECK: ldi {{.*}}[[RESULT]], 8
; CHECK: ret
diff --git a/test/CodeGen/AVR/cttz.ll b/test/CodeGen/AVR/cttz.ll
index 2501566275ea..02d36954f526 100644
--- a/test/CodeGen/AVR/cttz.ll
+++ b/test/CodeGen/AVR/cttz.ll
@@ -10,7 +10,7 @@ declare i8 @llvm.cttz.i8(i8)
; CHECK-LABEL: count_trailing_zeros:
; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
; CHECK: dec {{.*}}[[SCRATCH]]
; CHECK: com {{.*}}[[RESULT]]
@@ -34,7 +34,7 @@ declare i8 @llvm.cttz.i8(i8)
; CHECK: andi {{.*}}[[SCRATCH]], 15
; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: [[END_BB]]:
; CHECK: ldi {{.*}}[[SCRATCH]], 8
; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
; CHECK: ret
diff --git a/test/CodeGen/AVR/frmidx-iterator-bug.ll b/test/CodeGen/AVR/frmidx-iterator-bug.ll
new file mode 100644
index 000000000000..f9e2f0688faf
--- /dev/null
+++ b/test/CodeGen/AVR/frmidx-iterator-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=avr -mattr=avr6 | FileCheck %s
+
+%str_slice = type { i8*, i16 }
+%Machine = type { i16, [0 x i8], i16, [0 x i8], [16 x i8], [0 x i8] }
+
+; CHECK-LABEL: step
+define void @step(%Machine*) {
+ ret void
+}
+
+; CHECK-LABEL: main
+define void @main() {
+start:
+ %machine = alloca %Machine, align 8
+ %v0 = bitcast %Machine* %machine to i8*
+ %v1 = getelementptr inbounds %Machine, %Machine* %machine, i16 0, i32 2
+ %v2 = load i16, i16* %v1, align 2
+ br label %bb2.i5
+
+bb2.i5:
+ %v18 = load volatile i8, i8* inttoptr (i16 77 to i8*), align 1
+ %v19 = icmp sgt i8 %v18, -1
+ br i1 %v19, label %bb2.i5, label %bb.exit6
+
+bb.exit6:
+ %v20 = load volatile i8, i8* inttoptr (i16 78 to i8*), align 2
+ br label %bb7
+
+bb7:
+ call void @step(%Machine* %machine)
+ br label %bb7
+}
+
diff --git a/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll
new file mode 100644
index 000000000000..17ac29e2cdb8
--- /dev/null
+++ b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mattr=lpm,lpmw < %s -march=avr | FileCheck %s
+
+declare void @callback(i16 zeroext)
+
+; CHECK-LABEL: foo
+define void @foo() {
+entry:
+ ; CHECK: ldi r{{[0-9]+}}, pm_lo8(callback)
+ ; CHECK-NEXT: ldi r{{[0-9]+}}, pm_hi8(callback)
+ call void @bar(i8 zeroext undef, void (i16)* @callback)
+ ret void
+}
+
+declare void @bar(i8 zeroext, void (i16)*)
+
diff --git a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
index bcea4e6dfe27..4d58c85f4f23 100644
--- a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
@@ -17,8 +17,8 @@ body: |
; CHECK-LABEL: test_andiwrdrr
- ; CHECK: %r20 = ANDIRdK %r20, 175, implicit-def dead %sreg
- ; CHECK-NEXT: %r21 = ANDIRdK %r21, 250, implicit-def %sreg
+ ; CHECK: %r16 = ANDIRdK %r16, 175, implicit-def dead %sreg
+ ; CHECK-NEXT: %r17 = ANDIRdK %r17, 250, implicit-def %sreg
- %r21r20 = ANDIWRdK %r17r16, 64175, implicit-def %sreg
+ %r17r16 = ANDIWRdK %r17r16, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/COMWRd.mir b/test/CodeGen/AVR/pseudo/COMWRd.mir
index 58ff7af7cb3c..db68a4082b73 100644
--- a/test/CodeGen/AVR/pseudo/COMWRd.mir
+++ b/test/CodeGen/AVR/pseudo/COMWRd.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r14 = COMRd %r14, implicit-def dead %sreg
; CHECK-NEXT: %r15 = COMRd %r15, implicit-def %sreg
- %r15r14 = COMWRd %r9r8, implicit-def %sreg
+ %r15r14 = COMWRd %r15r14, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/ORIWRdK.mir b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
index d77a6ba88488..eaa12842df42 100644
--- a/test/CodeGen/AVR/pseudo/ORIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = ORIRdK %r20, 175, implicit-def dead %sreg
; CHECK-NEXT: %r21 = ORIRdK %r21, 250, implicit-def %sreg
- %r21r20 = ORIWRdK %r17r16, 64175, implicit-def %sreg
+ %r21r20 = ORIWRdK %r21r20, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
index 644e6106ee79..a92f6951798b 100644
--- a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = SBCIRdK %r20, 175, implicit-def %sreg, implicit killed %sreg
; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
- %r21r20 = SBCIWRdK %r17r16, 64175, implicit-def %sreg, implicit %sreg
+ %r21r20 = SBCIWRdK %r21r20, 64175, implicit-def %sreg, implicit %sreg
...
diff --git a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
index c7d88d7ab3f6..38ff880a5172 100644
--- a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
@@ -20,5 +20,5 @@ body: |
; CHECK: %r20 = SUBIRdK %r20, 175, implicit-def %sreg
; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg
- %r21r20 = SUBIWRdK %r17r16, 64175, implicit-def %sreg
+ %r21r20 = SUBIWRdK %r21r20, 64175, implicit-def %sreg
...
diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll
index ca7ec1ab831c..aca9502b5dfb 100644
--- a/test/CodeGen/AVR/select-mbb-placement-bug.ll
+++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll
@@ -8,9 +8,9 @@ define internal fastcc void @loopy() {
;
; https://github.com/avr-rust/rust/issues/49
-; CHECK: LBB0_1:
-; CHECK: LBB0_2:
-; CHECK-NOT: LBB0_3:
+; CHECK: LBB0_{{[0-9]+}}:
+; CHECK: LBB0_{{[0-9]+}}:
+; CHECK-NOT: LBB0_{{[0-9]+}}:
start:
br label %bb7.preheader
diff --git a/test/CodeGen/BPF/undef.ll b/test/CodeGen/BPF/undef.ll
index de14bfde1ab9..8d8a5f429514 100644
--- a/test/CodeGen/BPF/undef.ll
+++ b/test/CodeGen/BPF/undef.ll
@@ -1,4 +1,5 @@
-; RUN: not llc < %s -march=bpf | FileCheck %s
+; RUN: not llc < %s -march=bpfel | FileCheck -check-prefixes=CHECK,EL %s
+; RUN: not llc < %s -march=bpfeb | FileCheck -check-prefixes=CHECK,EB %s
%struct.bpf_map_def = type { i32, i32, i32, i32 }
%struct.__sk_buff = type opaque
@@ -13,36 +14,31 @@
; Function Attrs: nounwind uwtable
define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
-; CHECK: r2 = r10
-; CHECK: r2 += -2
-; CHECK: r1 = 0
-; CHECK: *(u16 *)(r2 + 6) = r1
-; CHECK: *(u16 *)(r2 + 4) = r1
-; CHECK: *(u16 *)(r2 + 2) = r1
-; CHECK: r2 = 6
-; CHECK: *(u8 *)(r10 - 7) = r2
-; CHECK: r2 = 5
-; CHECK: *(u8 *)(r10 - 8) = r2
-; CHECK: r2 = 7
-; CHECK: *(u8 *)(r10 - 6) = r2
-; CHECK: r2 = 8
-; CHECK: *(u8 *)(r10 - 5) = r2
-; CHECK: r2 = 9
-; CHECK: *(u8 *)(r10 - 4) = r2
-; CHECK: r2 = 10
-; CHECK: *(u8 *)(r10 - 3) = r2
-; CHECK: *(u16 *)(r10 + 24) = r1
-; CHECK: *(u16 *)(r10 + 22) = r1
-; CHECK: *(u16 *)(r10 + 20) = r1
-; CHECK: *(u16 *)(r10 + 18) = r1
-; CHECK: *(u16 *)(r10 + 16) = r1
-; CHECK: *(u16 *)(r10 + 14) = r1
-; CHECK: *(u16 *)(r10 + 12) = r1
-; CHECK: *(u16 *)(r10 + 10) = r1
-; CHECK: *(u16 *)(r10 + 8) = r1
-; CHECK: *(u16 *)(r10 + 6) = r1
-; CHECK: *(u16 *)(r10 - 2) = r1
-; CHECK: *(u16 *)(r10 + 26) = r1
+; CHECK: r1 = r10
+; CHECK: r1 += -2
+; CHECK: r2 = 0
+; CHECK: *(u16 *)(r1 + 6) = r2
+; CHECK: *(u16 *)(r1 + 4) = r2
+; CHECK: *(u16 *)(r1 + 2) = r2
+; EL: r1 = 134678021
+; EB: r1 = 84281096
+; CHECK: *(u32 *)(r10 - 8) = r1
+; CHECK: r1 = 9
+; CHECK: *(u8 *)(r10 - 4) = r1
+; CHECK: r1 = 10
+; CHECK: *(u8 *)(r10 - 3) = r1
+; CHECK: *(u16 *)(r10 + 24) = r2
+; CHECK: *(u16 *)(r10 + 22) = r2
+; CHECK: *(u16 *)(r10 + 20) = r2
+; CHECK: *(u16 *)(r10 + 18) = r2
+; CHECK: *(u16 *)(r10 + 16) = r2
+; CHECK: *(u16 *)(r10 + 14) = r2
+; CHECK: *(u16 *)(r10 + 12) = r2
+; CHECK: *(u16 *)(r10 + 10) = r2
+; CHECK: *(u16 *)(r10 + 8) = r2
+; CHECK: *(u16 *)(r10 + 6) = r2
+; CHECK: *(u16 *)(r10 - 2) = r2
+; CHECK: *(u16 *)(r10 + 26) = r2
; CHECK: r2 = r10
; CHECK: r2 += -8
; CHECK: r1 = <MCOperand Expr:(routing)>ll
diff --git a/test/CodeGen/Generic/pr33094.ll b/test/CodeGen/Generic/pr33094.ll
new file mode 100644
index 000000000000..afa464f63f66
--- /dev/null
+++ b/test/CodeGen/Generic/pr33094.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s
+
+; PR33094
+; Make sure that a constant extractvalue doesn't cause a crash in
+; SelectionDAGBuilder::visitExtractValue.
+
+%A = type {}
+%B = type {}
+%Tuple = type { i64 }
+
+@A_Inst = global %A zeroinitializer
+@B_Inst = global %B zeroinitializer
+
+define i64 @foo() {
+ ret i64 extractvalue (%Tuple select (i1 icmp eq
+ (%B* bitcast (%A* @A_Inst to %B*), %B* @B_Inst),
+ %Tuple { i64 33 }, %Tuple { i64 42 }), 0)
+}
diff --git a/test/CodeGen/Hexagon/convertdptoint.ll b/test/CodeGen/Hexagon/convertdptoint.ll
index a09c2fd14b12..adf76e5dc82e 100644
--- a/test/CodeGen/Hexagon/convertdptoint.ll
+++ b/test/CodeGen/Hexagon/convertdptoint.ll
@@ -12,10 +12,10 @@ entry:
%b = alloca double, align 8
%c = alloca double, align 8
store i32 0, i32* %retval
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
%2 = load double, double* %c, align 8
diff --git a/test/CodeGen/Hexagon/convertdptoll.ll b/test/CodeGen/Hexagon/convertdptoll.ll
index f46d46cf76b1..6b5bf56a248b 100644
--- a/test/CodeGen/Hexagon/convertdptoll.ll
+++ b/test/CodeGen/Hexagon/convertdptoll.ll
@@ -17,8 +17,8 @@ entry:
%0 = load double, double* %a, align 8
%1 = load double, double* %b, align 8
%add = fadd double %0, %1
- store double %add, double* %c, align 8
- %2 = load double, double* %c, align 8
+ store volatile double %add, double* %c, align 8
+ %2 = load volatile double, double* %c, align 8
%conv = fptosi double %2 to i64
store i64 %conv, i64* %i, align 8
%3 = load i64, i64* %i, align 8
diff --git a/test/CodeGen/Hexagon/convertsptoint.ll b/test/CodeGen/Hexagon/convertsptoint.ll
index 7593e57d852f..939b3b06a8c7 100644
--- a/test/CodeGen/Hexagon/convertsptoint.ll
+++ b/test/CodeGen/Hexagon/convertsptoint.ll
@@ -17,8 +17,8 @@ entry:
%0 = load float, float* %a, align 4
%1 = load float, float* %b, align 4
%add = fadd float %0, %1
- store float %add, float* %c, align 4
- %2 = load float, float* %c, align 4
+ store volatile float %add, float* %c, align 4
+ %2 = load volatile float, float* %c, align 4
%conv = fptosi float %2 to i32
store i32 %conv, i32* %i, align 4
%3 = load i32, i32* %i, align 4
diff --git a/test/CodeGen/Hexagon/convertsptoll.ll b/test/CodeGen/Hexagon/convertsptoll.ll
index d8432cbc812b..f540397ccf5e 100644
--- a/test/CodeGen/Hexagon/convertsptoll.ll
+++ b/test/CodeGen/Hexagon/convertsptoll.ll
@@ -17,8 +17,8 @@ entry:
%0 = load float, float* %a, align 4
%1 = load float, float* %b, align 4
%add = fadd float %0, %1
- store float %add, float* %c, align 4
- %2 = load float, float* %c, align 4
+ store volatile float %add, float* %c, align 4
+ %2 = load volatile float, float* %c, align 4
%conv = fptosi float %2 to i64
store i64 %conv, i64* %i, align 8
%3 = load i64, i64* %i, align 8
diff --git a/test/CodeGen/Hexagon/dadd.ll b/test/CodeGen/Hexagon/dadd.ll
index 5fcd705bab23..3068f499d12d 100644
--- a/test/CodeGen/Hexagon/dadd.ll
+++ b/test/CodeGen/Hexagon/dadd.ll
@@ -9,10 +9,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/dmul.ll b/test/CodeGen/Hexagon/dmul.ll
index 1b79e0aa7d70..a6cf62b0c0aa 100644
--- a/test/CodeGen/Hexagon/dmul.ll
+++ b/test/CodeGen/Hexagon/dmul.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %b, align 8
- %1 = load double, double* %a, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %b, align 8
+ %1 = load volatile double, double* %a, align 8
%mul = fmul double %0, %1
store double %mul, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
index 6bf8224904ec..ccc287c5f2bc 100644
--- a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
+++ b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll
@@ -12,10 +12,10 @@ entry:
%b = alloca double, align 8
%c = alloca double, align 8
store i32 0, i32* %retval
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %a, align 8
- %1 = load double, double* %b, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %a, align 8
+ %1 = load volatile double, double* %b, align 8
%add = fadd double %0, %1
store double %add, double* %c, align 8
%2 = load double, double* %c, align 8
diff --git a/test/CodeGen/Hexagon/dsub.ll b/test/CodeGen/Hexagon/dsub.ll
index 8b37301d84fb..d7e44b307cf8 100644
--- a/test/CodeGen/Hexagon/dsub.ll
+++ b/test/CodeGen/Hexagon/dsub.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca double, align 8
%b = alloca double, align 8
%c = alloca double, align 8
- store double 1.540000e+01, double* %a, align 8
- store double 9.100000e+00, double* %b, align 8
- %0 = load double, double* %b, align 8
- %1 = load double, double* %a, align 8
+ store volatile double 1.540000e+01, double* %a, align 8
+ store volatile double 9.100000e+00, double* %b, align 8
+ %0 = load volatile double, double* %b, align 8
+ %1 = load volatile double, double* %a, align 8
%sub = fsub double %0, %1
store double %sub, double* %c, align 8
ret i32 0
diff --git a/test/CodeGen/Hexagon/fadd.ll b/test/CodeGen/Hexagon/fadd.ll
index 0418c1724f5b..65c6182dcc77 100644
--- a/test/CodeGen/Hexagon/fadd.ll
+++ b/test/CodeGen/Hexagon/fadd.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %a, align 4
- %1 = load float, float* %b, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %a, align 4
+ %1 = load volatile float, float* %b, align 4
%add = fadd float %0, %1
store float %add, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/fmul.ll b/test/CodeGen/Hexagon/fmul.ll
index 552f98ec7a53..e20e293c0a13 100644
--- a/test/CodeGen/Hexagon/fmul.ll
+++ b/test/CodeGen/Hexagon/fmul.ll
@@ -9,10 +9,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %b, align 4
- %1 = load float, float* %a, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %b, align 4
+ %1 = load volatile float, float* %a, align 4
%mul = fmul float %0, %1
store float %mul, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/fsub.ll b/test/CodeGen/Hexagon/fsub.ll
index d7b0e2f65b33..e9a1fa3d192b 100644
--- a/test/CodeGen/Hexagon/fsub.ll
+++ b/test/CodeGen/Hexagon/fsub.ll
@@ -8,10 +8,10 @@ entry:
%a = alloca float, align 4
%b = alloca float, align 4
%c = alloca float, align 4
- store float 0x402ECCCCC0000000, float* %a, align 4
- store float 0x4022333340000000, float* %b, align 4
- %0 = load float, float* %b, align 4
- %1 = load float, float* %a, align 4
+ store volatile float 0x402ECCCCC0000000, float* %a, align 4
+ store volatile float 0x4022333340000000, float* %b, align 4
+ %0 = load volatile float, float* %b, align 4
+ %1 = load volatile float, float* %a, align 4
%sub = fsub float %0, %1
store float %sub, float* %c, align 4
ret i32 0
diff --git a/test/CodeGen/Hexagon/hasfp-crash1.ll b/test/CodeGen/Hexagon/hasfp-crash1.ll
new file mode 100644
index 000000000000..1154a7117a70
--- /dev/null
+++ b/test/CodeGen/Hexagon/hasfp-crash1.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that this testcase does not crash.
+; CHECK: jumpr r31
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+declare i32 @foo0(i32*, i32, i64, i32, i8 zeroext, i8 zeroext, i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @foo1(i32* %a0, i32 %a1, i32 %a2, i32 %a3, i8 zeroext %a4, i8 zeroext %a5, i32 %a6) local_unnamed_addr #0 !dbg !33 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 %a6, i64 0, metadata !51, metadata !52), !dbg !53
+ ret i32 undef, !dbg !54
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !27}
+!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22)
+!1 = !DIFile(filename: "foo.i", directory: "/path")
+!2 = !{!3, !16}
+!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5)
+!4 = !DIFile(filename: "foo.h", directory: "/path")
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !DIEnumerator(name: "E0", value: 7)
+!7 = !DIEnumerator(name: "E1", value: 6)
+!8 = !DIEnumerator(name: "E2", value: 5)
+!9 = !DIEnumerator(name: "E3", value: 0)
+!10 = !DIEnumerator(name: "E4", value: 1)
+!11 = !DIEnumerator(name: "E5", value: 7)
+!12 = !DIEnumerator(name: "E6", value: 5)
+!13 = !DIEnumerator(name: "E7", value: 4)
+!14 = !DIEnumerator(name: "E8", value: 4)
+!15 = !DIEnumerator(name: "E9", value: 10)
+!16 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17)
+!17 = !{!18, !19, !20, !21}
+!18 = !DIEnumerator(name: "F0", value: 1)
+!19 = !DIEnumerator(name: "F1", value: 2)
+!20 = !DIEnumerator(name: "F2", value: 4)
+!21 = !DIEnumerator(name: "F3", value: 7)
+!22 = !{!23, !24, !25}
+!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24)
+!26 = !{i32 2, !"Debug Info Version", i32 3}
+!27 = !{i32 6, !"Linker Options", !28}
+!28 = !{!29, !30, !31, !32}
+!29 = !{!"foo0", !".text"}
+!30 = !{!"foo1", !".text"}
+!31 = !{!"foo2", !".text"}
+!32 = !{!"foo3", !".text"}
+!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44)
+!34 = !DIFile(filename: "foo.c", directory: "/path")
+!35 = !DISubroutineType(types: !36)
+!36 = !{!37, !38, !39, !40, !41, !42, !43, !37}
+!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32)
+!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23)
+!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23)
+!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23)
+!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3)
+!43 = !DIDerivedType(tag: DW_TAG_typedef, name: "t5_t", file: !4, line: 141, baseType: !16)
+!44 = !{!45, !46, !47, !48, !49, !50, !51}
+!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38)
+!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39)
+!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40)
+!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41)
+!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42)
+!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43)
+!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37)
+!52 = !DIExpression()
+!53 = !DILocation(line: 84, column: 169, scope: !33)
+!54 = !DILocation(line: 86, column: 5, scope: !33)
diff --git a/test/CodeGen/Hexagon/hasfp-crash2.ll b/test/CodeGen/Hexagon/hasfp-crash2.ll
new file mode 100644
index 000000000000..c8b49948ce74
--- /dev/null
+++ b/test/CodeGen/Hexagon/hasfp-crash2.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that this testcase does not crash.
+; CHECK: call foo0
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+declare void @foo0() local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define void @foo1() local_unnamed_addr #0 !dbg !33 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !51, metadata !52), !dbg !53
+ tail call void @foo0(), !dbg !54
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !27}
+!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22)
+!1 = !DIFile(filename: "foo.i", directory: "/path")
+!2 = !{!3, !16}
+!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5)
+!4 = !DIFile(filename: "foo.h", directory: "/path")
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!6 = !DIEnumerator(name: "E0", value: 7)
+!7 = !DIEnumerator(name: "E1", value: 6)
+!8 = !DIEnumerator(name: "E2", value: 5)
+!9 = !DIEnumerator(name: "E3", value: 0)
+!10 = !DIEnumerator(name: "E4", value: 1)
+!11 = !DIEnumerator(name: "E5", value: 7)
+!12 = !DIEnumerator(name: "E6", value: 5)
+!13 = !DIEnumerator(name: "E7", value: 4)
+!14 = !DIEnumerator(name: "E8", value: 4)
+!15 = !DIEnumerator(name: "E9", value: 10)
+!16 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17)
+!17 = !{!18, !19, !20, !21}
+!18 = !DIEnumerator(name: "F0", value: 1)
+!19 = !DIEnumerator(name: "F1", value: 2)
+!20 = !DIEnumerator(name: "F2", value: 4)
+!21 = !DIEnumerator(name: "F3", value: 7)
+!22 = !{!23, !24, !25}
+!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24)
+!26 = !{i32 2, !"Debug Info Version", i32 3}
+!27 = !{i32 6, !"Linker Options", !28}
+!28 = !{!29, !30, !31, !32}
+!29 = !{!"foo0", !".text"}
+!30 = !{!"foo1", !".text"}
+!31 = !{!"foo2", !".text"}
+!32 = !{!"foo3", !".text"}
+!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44)
+!34 = !DIFile(filename: "foo.c", directory: "/path")
+!35 = !DISubroutineType(types: !36)
+!36 = !{!37, !38, !39, !40, !41, !42, !43, !37}
+!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32)
+!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23)
+!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23)
+!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23)
+!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3)
+!43 = !DIDerivedType(tag: DW_TAG_typedef, name: "t5_t", file: !4, line: 141, baseType: !16)
+!44 = !{!45, !46, !47, !48, !49, !50, !51}
+!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38)
+!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39)
+!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40)
+!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41)
+!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42)
+!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43)
+!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37)
+!52 = !DIExpression()
+!53 = !DILocation(line: 84, column: 169, scope: !33)
+!54 = !DILocation(line: 86, column: 12, scope: !33)
diff --git a/test/CodeGen/Hexagon/hvx-nontemporal.ll b/test/CodeGen/Hexagon/hvx-nontemporal.ll
new file mode 100644
index 000000000000..98c5ef4809b0
--- /dev/null
+++ b/test/CodeGen/Hexagon/hvx-nontemporal.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+target triple = "hexagon"
+
+; Function Attrs: norecurse nounwind
+define void @test(<32 x i32>* nocapture readonly %x, <32 x i32>* nocapture readnone %y, <32 x i32>* nocapture %a, <32 x i32>* nocapture %b) #0 {
+entry:
+; CHECK: v0 = vmem(r0+#7):nt
+ %add.ptr = getelementptr inbounds <32 x i32>, <32 x i32>* %x, i32 7
+ %0 = load <32 x i32>, <32 x i32>* %add.ptr, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: v1.cur = vmem(r2+#0):nt
+ %1 = load <32 x i32>, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: vmem(r3+#3):nt = v1
+ %add.ptr2 = getelementptr inbounds <32 x i32>, <32 x i32>* %b, i32 3
+ store <32 x i32> %1, <32 x i32>* %add.ptr2, align 128, !tbaa !1, !nontemporal !4
+
+; CHECK: vmem(r2+#0):nt = v0
+ store <32 x i32> %0, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{i32 1}
diff --git a/test/CodeGen/Hexagon/target-flag-ext.mir b/test/CodeGen/Hexagon/target-flag-ext.mir
new file mode 100644
index 000000000000..49e0d2870e00
--- /dev/null
+++ b/test/CodeGen/Hexagon/target-flag-ext.mir
@@ -0,0 +1,24 @@
+# RUN: llc -march=hexagon -run-pass hexagon-packetizer -o - %s | FileCheck %s
+---
+name: fred
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; Check that all these instructions go in the same packet. This is to
+ ; make sure that a target flag (other than HMOTF_ConstExtend) on an
+ ; operand will not be interpreted as a constant-extender flag.
+ ; The combination used below (pcrel + 0) does not technically make sense,
+ ; but combinations that do make sense require constant extending, so
+ ; testing this is not possible otherwise.
+
+ ; CHECK: BUNDLE
+ ; CHECK-DAG: %r0 = A2_tfrsi
+ ; CHECK-DAG: %r1 = A2_tfrsi
+ ; CHECK-DAG: %r2 = A2_tfrsi
+ ; CHECK: }
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ %r1 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ %r2 = A2_tfrsi target-flags (hexagon-pcrel) 0
+...
+
diff --git a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
index 1fe42a731488..1c81f580bee5 100644
--- a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
+++ b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
@@ -14,7 +14,7 @@
# CHECK: %3(s16) = G_LOAD %0(p0) :: (load acquire 2)
# CHECK: G_STORE %3(s16), %0(p0) :: (store release 2)
# CHECK: G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
-# CHECK: G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+# CHECK: G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8)
name: atomic_memoperands
body: |
bb.0:
@@ -25,6 +25,6 @@ body: |
%3:_(s16) = G_LOAD %0(p0) :: (load acquire 2)
G_STORE %3(s16), %0(p0) :: (store release 2)
G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
- G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+ G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8)
RET_ReallyLR
...
diff --git a/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir
new file mode 100644
index 000000000000..731d7165b9df
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir
@@ -0,0 +1,19 @@
+# RUN: not llc -mtriple=aarch64-none-linux-gnu -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+
+ define void @target_memoperands_error() {
+ ret void
+ }
+
+...
+---
+name: target_memoperands_error
+body: |
+ bb.0:
+
+ %0:_(p0) = COPY %x0
+ ; CHECK: [[@LINE+1]]:35: use of undefined target MMO flag 'aarch64-invalid'
+ %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-invalid" load 8)
+ RET_ReallyLR
+...
diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir
new file mode 100644
index 000000000000..f853b551e098
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir
@@ -0,0 +1,22 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
+
+--- |
+
+ define void @target_memoperands() {
+ ret void
+ }
+
+...
+---
+# CHECK-LABEL: name: target_memoperands
+# CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+# CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+name: target_memoperands
+body: |
+ bb.0:
+
+ %0:_(p0) = COPY %x0
+ %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+ G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+ RET_ReallyLR
+...
diff --git a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
index 7cef01c9d12d..c0251232fd5c 100644
--- a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
+++ b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
@@ -171,8 +171,8 @@ body: |
# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
# CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
-# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 killed %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec
name: add_f32_1.0_multi_f16_use
@@ -307,8 +307,8 @@ body: |
# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
# CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec
-# CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %11, %14, implicit %exec
+# CHECK: %16 = V_ADD_F16_e32 %12, %14, implicit %exec
# CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
name: add_f32_1.0_one_f32_use_multi_f16_use
@@ -514,8 +514,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
# CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
-# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F32_e32 %12, %13, implicit %exec
name: add_f16_1.0_multi_f32_use
alignment: 0
@@ -581,8 +581,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
# CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
-# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
name: add_f16_1.0_other_high_bits_multi_f16_use
alignment: 0
@@ -648,8 +648,8 @@ body: |
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
# CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
-# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
-# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec
name: add_f16_1.0_other_high_bits_use_f16_f32
alignment: 0
exposesReturnsTwice: false
diff --git a/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/test/CodeGen/MIR/AMDGPU/syncscopes.mir
new file mode 100644
index 000000000000..83506257d8bf
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/syncscopes.mir
@@ -0,0 +1,98 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -run-pass=none %s -o - | FileCheck --check-prefix=GCN %s
+
+--- |
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+ target triple = "amdgcn-amd-amdhsa"
+
+ define void @syncscopes(i32 %agent, i32 addrspace(4)* %agent_out, i32 %workgroup, i32 addrspace(4)* %workgroup_out, i32 %wavefront, i32 addrspace(4)* %wavefront_out) #0 {
+ entry:
+ store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4
+ store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4
+ store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4
+ ret void
+ }
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.break(i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+ ; Function Attrs: convergent nounwind
+ declare i1 @llvm.amdgcn.loop(i64) #1
+
+ ; Function Attrs: convergent nounwind
+ declare void @llvm.amdgcn.end.cf(i64) #1
+
+ attributes #0 = { "target-cpu"="gfx803" }
+ attributes #1 = { convergent nounwind }
+ attributes #2 = { convergent nounwind readnone }
+
+# GCN-LABEL: name: syncscopes
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+...
+---
+name: syncscopes
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr4_sgpr5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr4_sgpr5
+
+ S_WAITCNT 0
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr6 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 24, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 16, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr8 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 32, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 127
+ %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr0_sgpr1
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 40, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr1, implicit %exec, implicit killed %sgpr0_sgpr1, implicit %sgpr0_sgpr1, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr6, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out)
+ S_WAITCNT 112
+ %vgpr0 = V_MOV_B32_e32 %sgpr2, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr2_sgpr3
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr3, implicit %exec, implicit killed %sgpr2_sgpr3, implicit %sgpr2_sgpr3, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr7, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out)
+ S_WAITCNT 112
+ %vgpr0 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr4_sgpr5
+ %vgpr1 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit killed %sgpr4_sgpr5, implicit %sgpr4_sgpr5, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr8, implicit %exec, implicit %exec
+ FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/MIR/AMDGPU/target-flags.mir b/test/CodeGen/MIR/AMDGPU/target-flags.mir
new file mode 100644
index 000000000000..7d288dd1b045
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/target-flags.mir
@@ -0,0 +1,29 @@
+# RUN: llc -march=amdgcn -run-pass none -o - %s | FileCheck %s
+--- |
+ define amdgpu_kernel void @flags() {
+ ret void
+ }
+
+ declare void @foo()
+...
+---
+
+# CHECK: SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc
+# CHECK: %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo
+
+name: flags
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ maxAlignment: 8
+registers:
+ - { id: 0, class: sreg_64, preferred-register: '' }
+ - { id: 1, class: sreg_64, preferred-register: '' }
+body: |
+ bb.0:
+ liveins: %sgpr0_sgpr1
+ %0 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc
+ %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo
+
+ S_ENDPGM
+...
diff --git a/test/CodeGen/MIR/Generic/runPass.mir b/test/CodeGen/MIR/Generic/runPass.mir
index 33380d4c6bb4..54c1dd221bdb 100644
--- a/test/CodeGen/MIR/Generic/runPass.mir
+++ b/test/CodeGen/MIR/Generic/runPass.mir
@@ -1,5 +1,6 @@
# RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s | FileCheck %s
# RUN: llc -run-pass=regallocbasic -debug-pass=Arguments -o - %s | FileCheck %s
+# RUN: llc -run-pass=regallocfast -debug-pass=Arguments -o - %s | FileCheck %s
# Check that passes are initialized correctly, so that it's possible to
# use -run-pass.
@@ -7,6 +8,7 @@
---
# CHECK: name: foo
name: foo
+tracksRegLiveness: true
body: |
bb.0:
...
diff --git a/test/CodeGen/MIR/Hexagon/target-flags.mir b/test/CodeGen/MIR/Hexagon/target-flags.mir
new file mode 100644
index 000000000000..656e0a6ea859
--- /dev/null
+++ b/test/CodeGen/MIR/Hexagon/target-flags.mir
@@ -0,0 +1,36 @@
+# RUN: llc -march=hexagon -run-pass none -o - %s | FileCheck %s
+---
+name: fred
+
+body: |
+ bb.0:
+
+ ; CHECK: target-flags(hexagon-pcrel)
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0
+ ; CHECK: target-flags(hexagon-got)
+ %r0 = A2_tfrsi target-flags (hexagon-got) 0
+ ; CHECK: target-flags(hexagon-lo16)
+ %r0 = A2_tfrsi target-flags (hexagon-lo16) 0
+ ; CHECK: target-flags(hexagon-hi16)
+ %r0 = A2_tfrsi target-flags (hexagon-hi16) 0
+ ; CHECK: target-flags(hexagon-gprel)
+ %r0 = A2_tfrsi target-flags (hexagon-gprel) 0
+ ; CHECK: target-flags(hexagon-gdgot)
+ %r0 = A2_tfrsi target-flags (hexagon-gdgot) 0
+ ; CHECK: target-flags(hexagon-gdplt)
+ %r0 = A2_tfrsi target-flags (hexagon-gdplt) 0
+ ; CHECK: target-flags(hexagon-ie)
+ %r0 = A2_tfrsi target-flags (hexagon-ie) 0
+ ; CHECK: target-flags(hexagon-iegot)
+ %r0 = A2_tfrsi target-flags (hexagon-iegot) 0
+ ; CHECK: target-flags(hexagon-tprel)
+ %r0 = A2_tfrsi target-flags (hexagon-tprel) 0
+
+ ; CHECK: target-flags(hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-ext) 0
+ ; CHECK: target-flags(hexagon-pcrel, hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-pcrel,hexagon-ext) 0
+ ; CHECK: target-flags(hexagon-ie, hexagon-ext)
+ %r0 = A2_tfrsi target-flags (hexagon-ie,hexagon-ext) 0
+...
+
diff --git a/test/CodeGen/MIR/X86/tied-physical-regs-match.mir b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
new file mode 100644
index 000000000000..1ddf649f76a7
--- /dev/null
+++ b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
@@ -0,0 +1,22 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the Machine Verifier detects tied physical registers
+# that don't match.
+
+--- |
+
+ define i32 @foo() {
+ entry:
+ ret i32 0
+ }
+
+...
+---
+name: foo
+body: |
+ bb.0.entry:
+ liveins: %rdi
+
+ ; CHECK: Tied physical registers must match.
+ %rbx = AND64rm killed %rdx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags
+ RETQ %rbx
+...
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index 951002d60a03..14a799b91717 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -64,6 +64,6 @@ entry:
%0 = load i16, i16* %retval ; <i16> [#uses=1]
ret i16 %0
; CHECK-LABEL: mov2:
-; CHECK: mov.w 0(r1), 4(r1)
-; CHECK: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 0(r1), 4(r1)
}
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
index f522c6722ee6..4298442157e2 100644
--- a/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
+; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
; llvm.mem* intrinsics get lowered to loops.
@@ -32,6 +33,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -50,6 +68,23 @@ entry:
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
+
+; WIR-LABEL: @memcpy_volatile_caller
+; WIR: entry:
+; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
+; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; WIR: loop-memcpy-expansion:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
}
define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -65,6 +100,32 @@ entry:
; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
+
+; WIR-LABEL: @memcpy_casting_caller
+; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
+; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
+; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
+; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
+}
+
+define i8* @memcpy_known_size(i8* %dst, i8* %src) {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i32 1, i1 false)
+ ret i8* %dst
+
+; Check that calls with compile-time constant size are handled correctly
+; WIR-LABEL: @memcpy_known_size
+; WIR: entry:
+; WIR: br label %load-store-loop
+; WIR: load-store-loop:
+; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; WIR: store i8 [[Load]], i8* [[DstGep]]
+; WIR: [[IndexInc]] = add i64 %loop-index, 1
+; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
+; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
}
define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
diff --git a/test/CodeGen/PowerPC/PR33636.ll b/test/CodeGen/PowerPC/PR33636.ll
new file mode 100644
index 000000000000..4a1216dd4c11
--- /dev/null
+++ b/test/CodeGen/PowerPC/PR33636.ll
@@ -0,0 +1,702 @@
+; Just a test case for a crash reported in
+; https://bugs.llvm.org/show_bug.cgi?id=33636
+; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 < %s | FileCheck %s
+@g_225 = external unnamed_addr global i16, align 2
+@g_756 = external global [6 x i32], align 4
+@g_3456 = external global i32, align 4
+@g_3708 = external global [9 x i32], align 4
+@g_1252 = external global i8*, align 8
+@g_3043 = external global float*, align 8
+
+; Function Attrs: nounwind
+define void @main() {
+ br i1 undef, label %1, label %4
+
+; <label>:1: ; preds = %0
+ br i1 undef, label %2, label %3
+
+; <label>:2: ; preds = %1
+ br label %3
+
+; <label>:3: ; preds = %2, %1
+ br label %4
+
+; <label>:4: ; preds = %3, %0
+ br label %5
+
+; <label>:5: ; preds = %5, %4
+ br i1 undef, label %6, label %5
+
+; <label>:6: ; preds = %5
+ br i1 undef, label %7, label %8
+
+; <label>:7: ; preds = %6
+ br i1 undef, label %70, label %69
+
+; <label>:8: ; preds = %6
+ br i1 undef, label %9, label %50
+
+; <label>:9: ; preds = %8
+ br label %11
+
+; <label>:10: ; preds = %28
+ br i1 undef, label %11, label %12
+
+; <label>:11: ; preds = %10, %9
+ br label %13
+
+; <label>:12: ; preds = %10
+ br label %30
+
+; <label>:13: ; preds = %23, %11
+ br i1 undef, label %17, label %14
+
+; <label>:14: ; preds = %13
+ br i1 undef, label %16, label %15
+
+; <label>:15: ; preds = %14
+ br label %22
+
+; <label>:16: ; preds = %14
+ br label %17
+
+; <label>:17: ; preds = %16, %13
+ br i1 undef, label %18, label %19
+
+; <label>:18: ; preds = %17
+ br label %19
+
+; <label>:19: ; preds = %18, %17
+ br i1 undef, label %48, label %20
+
+; <label>:20: ; preds = %19
+ br i1 undef, label %48, label %21
+
+; <label>:21: ; preds = %20
+ br label %22
+
+; <label>:22: ; preds = %21, %15
+ br i1 undef, label %23, label %24
+
+; <label>:23: ; preds = %22
+ br label %13
+
+; <label>:24: ; preds = %22
+ br i1 undef, label %28, label %25
+
+; <label>:25: ; preds = %24
+ br label %26
+
+; <label>:26: ; preds = %26, %25
+ br i1 undef, label %26, label %27
+
+; <label>:27: ; preds = %26
+ br label %48
+
+; <label>:28: ; preds = %24
+ br i1 undef, label %29, label %10
+
+; <label>:29: ; preds = %28
+ br label %48
+
+; <label>:30: ; preds = %33, %12
+ br i1 undef, label %32, label %33
+
+; <label>:31: ; preds = %33
+ br label %34
+
+; <label>:32: ; preds = %30
+ br label %33
+
+; <label>:33: ; preds = %32, %30
+ br i1 undef, label %30, label %31
+
+; <label>:34: ; preds = %47, %31
+ br i1 undef, label %35, label %36
+
+; <label>:35: ; preds = %34
+ br label %36
+
+; <label>:36: ; preds = %35, %34
+ br label %37
+
+; <label>:37: ; preds = %45, %36
+ br i1 undef, label %40, label %38
+
+; <label>:38: ; preds = %37
+ br i1 undef, label %39, label %46
+
+; <label>:39: ; preds = %38
+ br label %41
+
+; <label>:40: ; preds = %37
+ br label %41
+
+; <label>:41: ; preds = %40, %39
+ br label %42
+
+; <label>:42: ; preds = %44, %41
+ br i1 undef, label %43, label %44
+
+; <label>:43: ; preds = %42
+ br label %44
+
+; <label>:44: ; preds = %43, %42
+ br i1 undef, label %42, label %45
+
+; <label>:45: ; preds = %44
+ br i1 undef, label %37, label %47
+
+; <label>:46: ; preds = %38
+ br label %48
+
+; <label>:47: ; preds = %45
+ br i1 undef, label %34, label %49
+
+; <label>:48: ; preds = %46, %29, %27, %20, %19
+ br label %65
+
+; <label>:49: ; preds = %47
+ br label %58
+
+; <label>:50: ; preds = %8
+ br i1 undef, label %52, label %51
+
+; <label>:51: ; preds = %50
+ br label %57
+
+; <label>:52: ; preds = %50
+ br label %53
+
+; <label>:53: ; preds = %56, %52
+ br i1 undef, label %54, label %59
+
+; <label>:54: ; preds = %53
+ br i1 undef, label %60, label %59
+
+; <label>:55: ; preds = %64
+ br label %56
+
+; <label>:56: ; preds = %64, %55
+ br i1 undef, label %57, label %53
+
+; <label>:57: ; preds = %56, %51
+ br label %58
+
+; <label>:58: ; preds = %57, %49
+ br label %65
+
+; <label>:59: ; preds = %63, %62, %61, %60, %54, %53
+ br label %65
+
+; <label>:60: ; preds = %54
+ br i1 undef, label %61, label %59
+
+; <label>:61: ; preds = %60
+ br i1 undef, label %62, label %59
+
+; <label>:62: ; preds = %61
+ br i1 undef, label %63, label %59
+
+; <label>:63: ; preds = %62
+ br i1 undef, label %64, label %59
+
+; <label>:64: ; preds = %63
+ br i1 undef, label %55, label %56
+
+; <label>:65: ; preds = %59, %58, %48
+ br i1 undef, label %66, label %67
+
+; <label>:66: ; preds = %65
+ br label %67
+
+; <label>:67: ; preds = %66, %65
+ br i1 undef, label %68, label %92
+
+; <label>:68: ; preds = %67
+ br label %92
+
+; <label>:69: ; preds = %7
+ br label %70
+
+; <label>:70: ; preds = %69, %7
+ br i1 undef, label %72, label %71
+
+; <label>:71: ; preds = %70
+ br label %72
+
+; <label>:72: ; preds = %71, %70
+ br i1 undef, label %73, label %74
+
+; <label>:73: ; preds = %72
+ br label %74
+
+; <label>:74: ; preds = %73, %72
+ br i1 undef, label %85, label %75
+
+; <label>:75: ; preds = %74
+ br i1 undef, label %84, label %76
+
+; <label>:76: ; preds = %75
+ br i1 undef, label %78, label %77
+
+; <label>:77: ; preds = %77, %76
+ br i1 undef, label %84, label %77
+
+; <label>:78: ; preds = %76
+ br label %79
+
+; <label>:79: ; preds = %83, %78
+ br i1 undef, label %83, label %80
+
+; <label>:80: ; preds = %79
+ br i1 undef, label %81, label %82
+
+; <label>:81: ; preds = %80
+ br label %83
+
+; <label>:82: ; preds = %80
+ br label %83
+
+; <label>:83: ; preds = %82, %81, %79
+ br i1 undef, label %90, label %79
+
+; <label>:84: ; preds = %77, %75
+ br label %92
+
+; <label>:85: ; preds = %74
+ br i1 undef, label %86, label %88
+
+; <label>:86: ; preds = %85
+ br i1 undef, label %89, label %87
+
+; <label>:87: ; preds = %86
+ br i1 undef, label %89, label %88
+
+; <label>:88: ; preds = %87, %85
+ br label %89
+
+; <label>:89: ; preds = %88, %87, %86
+ br label %92
+
+; <label>:90: ; preds = %83
+ br i1 undef, label %92, label %91
+
+; <label>:91: ; preds = %90
+ br label %92
+
+; <label>:92: ; preds = %91, %90, %89, %84, %68, %67
+ br label %93
+
+; <label>:93: ; preds = %100, %92
+ br label %94
+
+; <label>:94: ; preds = %98, %93
+ br label %95
+
+; <label>:95: ; preds = %97, %94
+ br i1 undef, label %96, label %97
+
+; <label>:96: ; preds = %95
+ br label %97
+
+; <label>:97: ; preds = %96, %95
+ br i1 undef, label %95, label %98
+
+; <label>:98: ; preds = %97
+ store i32 7, i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 7), align 4
+ %99 = load volatile i32, i32* @g_3456, align 4
+ br i1 undef, label %94, label %100
+
+; <label>:100: ; preds = %98
+ br i1 undef, label %93, label %101
+
+; <label>:101: ; preds = %100
+ br label %102
+
+; <label>:102: ; preds = %117, %101
+ br label %103
+
+; <label>:103: ; preds = %109, %102
+ store i8** @g_1252, i8*** undef, align 8
+ br i1 undef, label %105, label %104
+
+; <label>:104: ; preds = %103
+ br label %105
+
+; <label>:105: ; preds = %104, %103
+ %106 = icmp eq i32 0, 0
+ br i1 %106, label %107, label %116
+
+; <label>:107: ; preds = %105
+ br i1 icmp ne (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)), label %109, label %108
+
+; <label>:108: ; preds = %107
+ br label %109
+
+; <label>:109: ; preds = %108, %107
+ %110 = phi i32 [ sdiv (i32 32, i32 zext (i1 icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)) to i32)), %108 ], [ 32, %107 ]
+ %111 = trunc i32 %110 to i8
+ %112 = icmp ne i8 %111, 0
+ %113 = and i1 %112, icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4))
+ %114 = zext i1 %113 to i16
+ store i16 %114, i16* @g_225, align 2
+ %115 = load volatile float*, float** @g_3043, align 8
+ br i1 undef, label %103, label %117
+
+; <label>:116: ; preds = %105
+ br label %119
+
+; <label>:117: ; preds = %109
+ br i1 undef, label %102, label %118
+
+; <label>:118: ; preds = %117
+ br label %119
+
+; <label>:119: ; preds = %118, %116
+ br i1 undef, label %120, label %231
+
+; <label>:120: ; preds = %119
+ br label %232
+
+; <label>:121: ; preds = %230
+ br label %122
+
+; <label>:122: ; preds = %230, %121
+ br i1 undef, label %124, label %123
+
+; <label>:123: ; preds = %122
+ br label %124
+
+; <label>:124: ; preds = %123, %122
+ br i1 undef, label %228, label %225
+
+; <label>:125: ; preds = %218
+ br label %127
+
+; <label>:126: ; preds = %218
+ br label %127
+
+; <label>:127: ; preds = %216, %126, %125
+ br i1 undef, label %204, label %128
+
+; <label>:128: ; preds = %127
+ br label %205
+
+; <label>:129: ; preds = %216
+ br i1 undef, label %131, label %130
+
+; <label>:130: ; preds = %129
+ br label %131
+
+; <label>:131: ; preds = %130, %129
+ br i1 undef, label %133, label %132
+
+; <label>:132: ; preds = %131
+ br label %133
+
+; <label>:133: ; preds = %132, %131
+ br label %134
+
+; <label>:134: ; preds = %203, %133
+ br i1 undef, label %193, label %135
+
+; <label>:135: ; preds = %134
+ br label %194
+
+; <label>:136: ; preds = %203
+ br i1 undef, label %138, label %137
+
+; <label>:137: ; preds = %136
+ br label %138
+
+; <label>:138: ; preds = %137, %136
+ br i1 undef, label %192, label %139
+
+; <label>:139: ; preds = %138
+ br label %191
+
+; <label>:140: ; preds = %191, %190
+ br i1 undef, label %180, label %141
+
+; <label>:141: ; preds = %140
+ br label %181
+
+; <label>:142: ; preds = %190
+ br i1 undef, label %143, label %178
+
+; <label>:143: ; preds = %142
+ br label %179
+
+; <label>:144: ; preds = %179
+ br label %176
+
+; <label>:145: ; preds = %179
+ br label %176
+
+; <label>:146: ; preds = %177, %175, %174
+ br i1 undef, label %165, label %147
+
+; <label>:147: ; preds = %146
+ br label %166
+
+; <label>:148: ; preds = %174
+ br label %149
+
+; <label>:149: ; preds = %164, %148
+ br i1 undef, label %154, label %150
+
+; <label>:150: ; preds = %149
+ br label %155
+
+; <label>:151: ; preds = %164
+ br i1 undef, label %153, label %152
+
+; <label>:152: ; preds = %151
+ br label %153
+
+; <label>:153: ; preds = %152, %151
+ ret void
+
+; <label>:154: ; preds = %149
+ br label %155
+
+; <label>:155: ; preds = %154, %150
+ br i1 undef, label %157, label %156
+
+; <label>:156: ; preds = %155
+ br label %158
+
+; <label>:157: ; preds = %155
+ br label %158
+
+; <label>:158: ; preds = %157, %156
+ br i1 undef, label %160, label %159
+
+; <label>:159: ; preds = %158
+ br label %161
+
+; <label>:160: ; preds = %158
+ br label %161
+
+; <label>:161: ; preds = %160, %159
+ br i1 undef, label %163, label %162
+
+; <label>:162: ; preds = %161
+ br label %164
+
+; <label>:163: ; preds = %161
+ br label %164
+
+; <label>:164: ; preds = %163, %162
+ br i1 undef, label %151, label %149
+
+; <label>:165: ; preds = %146
+ br label %166
+
+; <label>:166: ; preds = %165, %147
+ br i1 undef, label %168, label %167
+
+; <label>:167: ; preds = %166
+ br label %169
+
+; <label>:168: ; preds = %166
+ br label %169
+
+; <label>:169: ; preds = %168, %167
+ br i1 undef, label %171, label %170
+
+; <label>:170: ; preds = %169
+ br label %172
+
+; <label>:171: ; preds = %169
+ br label %172
+
+; <label>:172: ; preds = %171, %170
+ br i1 undef, label %174, label %173
+
+; <label>:173: ; preds = %172
+ br label %174
+
+; <label>:174: ; preds = %173, %172
+ br i1 undef, label %148, label %146
+
+; <label>:175: ; preds = %176
+ br label %146
+
+; <label>:176: ; preds = %145, %144
+ br i1 undef, label %177, label %175
+
+; <label>:177: ; preds = %176
+ br label %146
+
+; <label>:178: ; preds = %142
+ br label %179
+
+; <label>:179: ; preds = %178, %143
+ br i1 undef, label %145, label %144
+
+; <label>:180: ; preds = %140
+ br label %181
+
+; <label>:181: ; preds = %180, %141
+ br i1 undef, label %183, label %182
+
+; <label>:182: ; preds = %181
+ br label %184
+
+; <label>:183: ; preds = %181
+ br label %184
+
+; <label>:184: ; preds = %183, %182
+ br i1 undef, label %186, label %185
+
+; <label>:185: ; preds = %184
+ br label %187
+
+; <label>:186: ; preds = %184
+ br label %187
+
+; <label>:187: ; preds = %186, %185
+ br i1 undef, label %189, label %188
+
+; <label>:188: ; preds = %187
+ br label %190
+
+; <label>:189: ; preds = %187
+ br label %190
+
+; <label>:190: ; preds = %189, %188
+ br i1 undef, label %142, label %140
+
+; <label>:191: ; preds = %192, %139
+ br label %140
+
+; <label>:192: ; preds = %138
+ br label %191
+
+; <label>:193: ; preds = %134
+ br label %194
+
+; <label>:194: ; preds = %193, %135
+ br i1 undef, label %196, label %195
+
+; <label>:195: ; preds = %194
+ br label %197
+
+; <label>:196: ; preds = %194
+ br label %197
+
+; <label>:197: ; preds = %196, %195
+ br i1 undef, label %199, label %198
+
+; <label>:198: ; preds = %197
+ br label %200
+
+; <label>:199: ; preds = %197
+ br label %200
+
+; <label>:200: ; preds = %199, %198
+ br i1 undef, label %202, label %201
+
+; <label>:201: ; preds = %200
+ br label %203
+
+; <label>:202: ; preds = %200
+ br label %203
+
+; <label>:203: ; preds = %202, %201
+ br i1 undef, label %136, label %134
+
+; <label>:204: ; preds = %127
+ br label %205
+
+; <label>:205: ; preds = %204, %128
+ br i1 undef, label %207, label %206
+
+; <label>:206: ; preds = %205
+ br label %208
+
+; <label>:207: ; preds = %205
+ br label %208
+
+; <label>:208: ; preds = %207, %206
+ br i1 undef, label %210, label %209
+
+; <label>:209: ; preds = %208
+ br label %211
+
+; <label>:210: ; preds = %208
+ br label %211
+
+; <label>:211: ; preds = %210, %209
+ br i1 undef, label %213, label %212
+
+; <label>:212: ; preds = %211
+ br label %214
+
+; <label>:213: ; preds = %211
+ br label %214
+
+; <label>:214: ; preds = %213, %212
+ br i1 undef, label %216, label %215
+
+; <label>:215: ; preds = %214
+ br label %216
+
+; <label>:216: ; preds = %215, %214
+ br i1 undef, label %129, label %127
+
+; <label>:217: ; preds = %220
+ br label %218
+
+; <label>:218: ; preds = %221, %217
+ br i1 undef, label %126, label %125
+
+; <label>:219: ; preds = %223
+ br label %220
+
+; <label>:220: ; preds = %224, %219
+ br i1 undef, label %221, label %217
+
+; <label>:221: ; preds = %220
+ br label %218
+
+; <label>:222: ; preds = %226
+ br label %223
+
+; <label>:223: ; preds = %227, %222
+ br i1 undef, label %224, label %219
+
+; <label>:224: ; preds = %223
+ br label %220
+
+; <label>:225: ; preds = %124
+ br label %226
+
+; <label>:226: ; preds = %228, %225
+ br i1 undef, label %227, label %222
+
+; <label>:227: ; preds = %226
+ br label %223
+
+; <label>:228: ; preds = %124
+ br label %226
+
+; <label>:229: ; preds = %232
+ br label %230
+
+; <label>:230: ; preds = %233, %229
+ br i1 undef, label %122, label %121
+
+; <label>:231: ; preds = %119
+ br label %232
+
+; <label>:232: ; preds = %231, %120
+ br i1 undef, label %233, label %229
+
+; <label>:233: ; preds = %232
+ br label %230
+
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index d57b3a203791..0c7a31d16b19 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -370,7 +370,7 @@ define void @test36() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread acquire
+ fence syncscope("singlethread") acquire
ret void
}
@@ -379,7 +379,7 @@ define void @test37() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread release
+ fence syncscope("singlethread") release
ret void
}
@@ -388,7 +388,7 @@ define void @test38() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- fence singlethread acq_rel
+ fence syncscope("singlethread") acq_rel
ret void
}
@@ -397,7 +397,7 @@ define void @test39() {
; PPC64LE: # BB#0:
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: blr
- fence singlethread seq_cst
+ fence syncscope("singlethread") seq_cst
ret void
}
@@ -1273,7 +1273,7 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread monotonic monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1294,7 +1294,7 @@ define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1315,7 +1315,7 @@ define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1336,7 +1336,7 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1357,7 +1357,7 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
ret void
}
@@ -1379,7 +1379,7 @@ define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1401,7 +1401,7 @@ define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1423,7 +1423,7 @@ define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst monotonic
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1445,7 +1445,7 @@ define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst acquire
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1467,7 +1467,7 @@ define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-NEXT: stbcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1487,7 +1487,7 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread monotonic monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1508,7 +1508,7 @@ define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1529,7 +1529,7 @@ define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1550,7 +1550,7 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1571,7 +1571,7 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
ret void
}
@@ -1593,7 +1593,7 @@ define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1615,7 +1615,7 @@ define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1637,7 +1637,7 @@ define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst monotonic
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1659,7 +1659,7 @@ define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst acquire
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1681,7 +1681,7 @@ define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-NEXT: sthcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1701,7 +1701,7 @@ define void @test100(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread monotonic monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1722,7 +1722,7 @@ define void @test101(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1743,7 +1743,7 @@ define void @test102(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1764,7 +1764,7 @@ define void @test103(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1785,7 +1785,7 @@ define void @test104(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
ret void
}
@@ -1807,7 +1807,7 @@ define void @test105(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -1829,7 +1829,7 @@ define void @test106(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -1851,7 +1851,7 @@ define void @test107(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst monotonic
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -1873,7 +1873,7 @@ define void @test108(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst acquire
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -1895,7 +1895,7 @@ define void @test109(i32* %ptr, i32 %cmp, i32 %val) {
; PPC64LE-NEXT: stwcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -1915,7 +1915,7 @@ define void @test110(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread monotonic monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
ret void
}
@@ -1936,7 +1936,7 @@ define void @test111(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic
ret void
}
@@ -1957,7 +1957,7 @@ define void @test112(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire
ret void
}
@@ -1978,7 +1978,7 @@ define void @test113(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
ret void
}
@@ -1999,7 +1999,7 @@ define void @test114(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: # BB#3:
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
ret void
}
@@ -2021,7 +2021,7 @@ define void @test115(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic
ret void
}
@@ -2043,7 +2043,7 @@ define void @test116(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire
ret void
}
@@ -2065,7 +2065,7 @@ define void @test117(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst monotonic
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic
ret void
}
@@ -2087,7 +2087,7 @@ define void @test118(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst acquire
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire
ret void
}
@@ -2109,7 +2109,7 @@ define void @test119(i64* %ptr, i64 %cmp, i64 %val) {
; PPC64LE-NEXT: stdcx. 6, 0, 3
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst seq_cst
+ %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst
ret void
}
@@ -5847,7 +5847,7 @@ define i8 @test340(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -5862,7 +5862,7 @@ define i8 @test341(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -5877,7 +5877,7 @@ define i8 @test342(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -5893,7 +5893,7 @@ define i8 @test343(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -5909,7 +5909,7 @@ define i8 @test344(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -5923,7 +5923,7 @@ define i16 @test345(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -5938,7 +5938,7 @@ define i16 @test346(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -5953,7 +5953,7 @@ define i16 @test347(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -5969,7 +5969,7 @@ define i16 @test348(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -5985,7 +5985,7 @@ define i16 @test349(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -5999,7 +5999,7 @@ define i32 @test350(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6014,7 +6014,7 @@ define i32 @test351(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6029,7 +6029,7 @@ define i32 @test352(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6045,7 +6045,7 @@ define i32 @test353(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6061,7 +6061,7 @@ define i32 @test354(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6075,7 +6075,7 @@ define i64 @test355(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6090,7 +6090,7 @@ define i64 @test356(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6105,7 +6105,7 @@ define i64 @test357(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6121,7 +6121,7 @@ define i64 @test358(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6137,7 +6137,7 @@ define i64 @test359(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -6152,7 +6152,7 @@ define i8 @test360(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6168,7 +6168,7 @@ define i8 @test361(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6184,7 +6184,7 @@ define i8 @test362(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6201,7 +6201,7 @@ define i8 @test363(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6218,7 +6218,7 @@ define i8 @test364(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6233,7 +6233,7 @@ define i16 @test365(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6249,7 +6249,7 @@ define i16 @test366(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6265,7 +6265,7 @@ define i16 @test367(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6282,7 +6282,7 @@ define i16 @test368(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6299,7 +6299,7 @@ define i16 @test369(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6314,7 +6314,7 @@ define i32 @test370(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6330,7 +6330,7 @@ define i32 @test371(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6346,7 +6346,7 @@ define i32 @test372(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6363,7 +6363,7 @@ define i32 @test373(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6380,7 +6380,7 @@ define i32 @test374(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6395,7 +6395,7 @@ define i64 @test375(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6411,7 +6411,7 @@ define i64 @test376(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6427,7 +6427,7 @@ define i64 @test377(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6444,7 +6444,7 @@ define i64 @test378(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6461,7 +6461,7 @@ define i64 @test379(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw add i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -6476,7 +6476,7 @@ define i8 @test380(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6492,7 +6492,7 @@ define i8 @test381(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6508,7 +6508,7 @@ define i8 @test382(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6525,7 +6525,7 @@ define i8 @test383(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6542,7 +6542,7 @@ define i8 @test384(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6557,7 +6557,7 @@ define i16 @test385(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6573,7 +6573,7 @@ define i16 @test386(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6589,7 +6589,7 @@ define i16 @test387(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6606,7 +6606,7 @@ define i16 @test388(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6623,7 +6623,7 @@ define i16 @test389(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6638,7 +6638,7 @@ define i32 @test390(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6654,7 +6654,7 @@ define i32 @test391(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6670,7 +6670,7 @@ define i32 @test392(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -6687,7 +6687,7 @@ define i32 @test393(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -6704,7 +6704,7 @@ define i32 @test394(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -6719,7 +6719,7 @@ define i64 @test395(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -6735,7 +6735,7 @@ define i64 @test396(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -6751,7 +6751,7 @@ define i64 @test397(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -6768,7 +6768,7 @@ define i64 @test398(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -6785,7 +6785,7 @@ define i64 @test399(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw sub i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -6800,7 +6800,7 @@ define i8 @test400(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -6816,7 +6816,7 @@ define i8 @test401(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -6832,7 +6832,7 @@ define i8 @test402(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -6849,7 +6849,7 @@ define i8 @test403(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -6866,7 +6866,7 @@ define i8 @test404(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -6881,7 +6881,7 @@ define i16 @test405(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -6897,7 +6897,7 @@ define i16 @test406(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -6913,7 +6913,7 @@ define i16 @test407(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -6930,7 +6930,7 @@ define i16 @test408(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -6947,7 +6947,7 @@ define i16 @test409(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -6962,7 +6962,7 @@ define i32 @test410(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -6978,7 +6978,7 @@ define i32 @test411(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -6994,7 +6994,7 @@ define i32 @test412(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7011,7 +7011,7 @@ define i32 @test413(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7028,7 +7028,7 @@ define i32 @test414(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7043,7 +7043,7 @@ define i64 @test415(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7059,7 +7059,7 @@ define i64 @test416(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7075,7 +7075,7 @@ define i64 @test417(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7092,7 +7092,7 @@ define i64 @test418(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7109,7 +7109,7 @@ define i64 @test419(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw and i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7124,7 +7124,7 @@ define i8 @test420(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7140,7 +7140,7 @@ define i8 @test421(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7156,7 +7156,7 @@ define i8 @test422(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7173,7 +7173,7 @@ define i8 @test423(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7190,7 +7190,7 @@ define i8 @test424(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7205,7 +7205,7 @@ define i16 @test425(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7221,7 +7221,7 @@ define i16 @test426(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7237,7 +7237,7 @@ define i16 @test427(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7254,7 +7254,7 @@ define i16 @test428(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7271,7 +7271,7 @@ define i16 @test429(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7286,7 +7286,7 @@ define i32 @test430(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7302,7 +7302,7 @@ define i32 @test431(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7318,7 +7318,7 @@ define i32 @test432(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7335,7 +7335,7 @@ define i32 @test433(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7352,7 +7352,7 @@ define i32 @test434(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7367,7 +7367,7 @@ define i64 @test435(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7383,7 +7383,7 @@ define i64 @test436(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7399,7 +7399,7 @@ define i64 @test437(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7416,7 +7416,7 @@ define i64 @test438(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7433,7 +7433,7 @@ define i64 @test439(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw nand i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7448,7 +7448,7 @@ define i8 @test440(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7464,7 +7464,7 @@ define i8 @test441(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7480,7 +7480,7 @@ define i8 @test442(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7497,7 +7497,7 @@ define i8 @test443(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7514,7 +7514,7 @@ define i8 @test444(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7529,7 +7529,7 @@ define i16 @test445(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7545,7 +7545,7 @@ define i16 @test446(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7561,7 +7561,7 @@ define i16 @test447(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7578,7 +7578,7 @@ define i16 @test448(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7595,7 +7595,7 @@ define i16 @test449(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7610,7 +7610,7 @@ define i32 @test450(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7626,7 +7626,7 @@ define i32 @test451(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7642,7 +7642,7 @@ define i32 @test452(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7659,7 +7659,7 @@ define i32 @test453(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -7676,7 +7676,7 @@ define i32 @test454(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -7691,7 +7691,7 @@ define i64 @test455(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -7707,7 +7707,7 @@ define i64 @test456(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -7723,7 +7723,7 @@ define i64 @test457(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -7740,7 +7740,7 @@ define i64 @test458(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -7757,7 +7757,7 @@ define i64 @test459(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw or i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -7772,7 +7772,7 @@ define i8 @test460(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -7788,7 +7788,7 @@ define i8 @test461(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -7804,7 +7804,7 @@ define i8 @test462(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -7821,7 +7821,7 @@ define i8 @test463(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -7838,7 +7838,7 @@ define i8 @test464(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -7853,7 +7853,7 @@ define i16 @test465(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -7869,7 +7869,7 @@ define i16 @test466(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -7885,7 +7885,7 @@ define i16 @test467(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -7902,7 +7902,7 @@ define i16 @test468(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -7919,7 +7919,7 @@ define i16 @test469(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -7934,7 +7934,7 @@ define i32 @test470(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -7950,7 +7950,7 @@ define i32 @test471(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -7966,7 +7966,7 @@ define i32 @test472(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -7983,7 +7983,7 @@ define i32 @test473(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8000,7 +8000,7 @@ define i32 @test474(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8015,7 +8015,7 @@ define i64 @test475(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8031,7 +8031,7 @@ define i64 @test476(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8047,7 +8047,7 @@ define i64 @test477(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: # BB#2:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8064,7 +8064,7 @@ define i64 @test478(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8081,7 +8081,7 @@ define i64 @test479(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw xor i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8099,7 +8099,7 @@ define i8 @test480(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB480_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8118,7 +8118,7 @@ define i8 @test481(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB481_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8137,7 +8137,7 @@ define i8 @test482(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB482_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8157,7 +8157,7 @@ define i8 @test483(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8177,7 +8177,7 @@ define i8 @test484(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8195,7 +8195,7 @@ define i16 @test485(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB485_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8214,7 +8214,7 @@ define i16 @test486(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB486_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8233,7 +8233,7 @@ define i16 @test487(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB487_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8253,7 +8253,7 @@ define i16 @test488(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -8273,7 +8273,7 @@ define i16 @test489(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -8290,7 +8290,7 @@ define i32 @test490(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB490_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -8308,7 +8308,7 @@ define i32 @test491(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB491_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -8326,7 +8326,7 @@ define i32 @test492(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB492_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -8345,7 +8345,7 @@ define i32 @test493(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8364,7 +8364,7 @@ define i32 @test494(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8381,7 +8381,7 @@ define i64 @test495(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB495_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8399,7 +8399,7 @@ define i64 @test496(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB496_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8417,7 +8417,7 @@ define i64 @test497(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB497_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8436,7 +8436,7 @@ define i64 @test498(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8455,7 +8455,7 @@ define i64 @test499(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw max i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8473,7 +8473,7 @@ define i8 @test500(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB500_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8492,7 +8492,7 @@ define i8 @test501(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB501_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8511,7 +8511,7 @@ define i8 @test502(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB502_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8531,7 +8531,7 @@ define i8 @test503(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8551,7 +8551,7 @@ define i8 @test504(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8569,7 +8569,7 @@ define i16 @test505(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB505_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8588,7 +8588,7 @@ define i16 @test506(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB506_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8607,7 +8607,7 @@ define i16 @test507(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB507_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8627,7 +8627,7 @@ define i16 @test508(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -8647,7 +8647,7 @@ define i16 @test509(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -8664,7 +8664,7 @@ define i32 @test510(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB510_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -8682,7 +8682,7 @@ define i32 @test511(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB511_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -8700,7 +8700,7 @@ define i32 @test512(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB512_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -8719,7 +8719,7 @@ define i32 @test513(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -8738,7 +8738,7 @@ define i32 @test514(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -8755,7 +8755,7 @@ define i64 @test515(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB515_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -8773,7 +8773,7 @@ define i64 @test516(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB516_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -8791,7 +8791,7 @@ define i64 @test517(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB517_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -8810,7 +8810,7 @@ define i64 @test518(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -8829,7 +8829,7 @@ define i64 @test519(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw min i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -8846,7 +8846,7 @@ define i8 @test520(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB520_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -8864,7 +8864,7 @@ define i8 @test521(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB521_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -8882,7 +8882,7 @@ define i8 @test522(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB522_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -8901,7 +8901,7 @@ define i8 @test523(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -8920,7 +8920,7 @@ define i8 @test524(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -8937,7 +8937,7 @@ define i16 @test525(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB525_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -8955,7 +8955,7 @@ define i16 @test526(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB526_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -8973,7 +8973,7 @@ define i16 @test527(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB527_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -8992,7 +8992,7 @@ define i16 @test528(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -9011,7 +9011,7 @@ define i16 @test529(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -9028,7 +9028,7 @@ define i32 @test530(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB530_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -9046,7 +9046,7 @@ define i32 @test531(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB531_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -9064,7 +9064,7 @@ define i32 @test532(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB532_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -9083,7 +9083,7 @@ define i32 @test533(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -9102,7 +9102,7 @@ define i32 @test534(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -9119,7 +9119,7 @@ define i64 @test535(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB535_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -9137,7 +9137,7 @@ define i64 @test536(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB536_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -9155,7 +9155,7 @@ define i64 @test537(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB537_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -9174,7 +9174,7 @@ define i64 @test538(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -9193,7 +9193,7 @@ define i64 @test539(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umax i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
@@ -9210,7 +9210,7 @@ define i8 @test540(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB540_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread monotonic
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") monotonic
ret i8 %ret
}
@@ -9228,7 +9228,7 @@ define i8 @test541(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB541_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acquire
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acquire
ret i8 %ret
}
@@ -9246,7 +9246,7 @@ define i8 @test542(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: .LBB542_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread release
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") release
ret i8 %ret
}
@@ -9265,7 +9265,7 @@ define i8 @test543(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acq_rel
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acq_rel
ret i8 %ret
}
@@ -9284,7 +9284,7 @@ define i8 @test544(i8* %ptr, i8 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i8* %ptr, i8 %val singlethread seq_cst
+ %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") seq_cst
ret i8 %ret
}
@@ -9301,7 +9301,7 @@ define i16 @test545(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB545_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread monotonic
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") monotonic
ret i16 %ret
}
@@ -9319,7 +9319,7 @@ define i16 @test546(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB546_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acquire
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acquire
ret i16 %ret
}
@@ -9337,7 +9337,7 @@ define i16 @test547(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: .LBB547_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread release
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") release
ret i16 %ret
}
@@ -9356,7 +9356,7 @@ define i16 @test548(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acq_rel
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acq_rel
ret i16 %ret
}
@@ -9375,7 +9375,7 @@ define i16 @test549(i16* %ptr, i16 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i16* %ptr, i16 %val singlethread seq_cst
+ %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") seq_cst
ret i16 %ret
}
@@ -9392,7 +9392,7 @@ define i32 @test550(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB550_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread monotonic
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") monotonic
ret i32 %ret
}
@@ -9410,7 +9410,7 @@ define i32 @test551(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB551_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acquire
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acquire
ret i32 %ret
}
@@ -9428,7 +9428,7 @@ define i32 @test552(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: .LBB552_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread release
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") release
ret i32 %ret
}
@@ -9447,7 +9447,7 @@ define i32 @test553(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acq_rel
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acq_rel
ret i32 %ret
}
@@ -9466,7 +9466,7 @@ define i32 @test554(i32* %ptr, i32 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i32* %ptr, i32 %val singlethread seq_cst
+ %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") seq_cst
ret i32 %ret
}
@@ -9483,7 +9483,7 @@ define i64 @test555(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB555_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread monotonic
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") monotonic
ret i64 %ret
}
@@ -9501,7 +9501,7 @@ define i64 @test556(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB556_3:
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acquire
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acquire
ret i64 %ret
}
@@ -9519,7 +9519,7 @@ define i64 @test557(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: .LBB557_3:
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread release
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") release
ret i64 %ret
}
@@ -9538,7 +9538,7 @@ define i64 @test558(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acq_rel
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acq_rel
ret i64 %ret
}
@@ -9557,7 +9557,7 @@ define i64 @test559(i64* %ptr, i64 %val) {
; PPC64LE-NEXT: mr 3, 5
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: blr
- %ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst
+ %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") seq_cst
ret i64 %ret
}
diff --git a/test/CodeGen/PowerPC/bitreverse.ll b/test/CodeGen/PowerPC/bitreverse.ll
deleted file mode 100644
index dca7340d035d..000000000000
--- a/test/CodeGen/PowerPC/bitreverse.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -verify-machineinstrs -march=ppc64 %s -o - | FileCheck %s
-
-; These tests just check that the plumbing is in place for @llvm.bitreverse. The
-; actual output is massive at the moment as llvm.bitreverse is not yet legal.
-
-declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
-
-define <2 x i16> @f(<2 x i16> %a) {
-; CHECK-LABEL: f:
-; CHECK: rlwinm
- %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
- ret <2 x i16> %b
-}
-
-declare i8 @llvm.bitreverse.i8(i8) readnone
-
-define i8 @g(i8 %a) {
-; CHECK-LABEL: g:
-; CHECK: rlwinm
-; CHECK: rlwimi
- %b = call i8 @llvm.bitreverse.i8(i8 %a)
- ret i8 %b
-}
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index c42f677d17ab..60bec4d18f12 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1028,7 +1028,7 @@ entry:
; P9LE: vperm
; P9LE: blr
; P8BE: sldi {{r[0-9]+}}, r4, 2
-; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3,
+; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3
; P8BE-DAG: lxvw4x
; P8BE: vperm
; P8BE: blr
@@ -2187,7 +2187,7 @@ entry:
; P9LE: vperm
; P9LE: blr
; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3
+; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3
; P8BE-DAG: lxvw4x
; P8BE: vperm
; P8BE: blr
diff --git a/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll
new file mode 100644
index 000000000000..71755f722cb2
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @limit_loop(i32 signext %iters, i32* nocapture readonly %vec, i32 signext %limit) local_unnamed_addr {
+entry:
+ %cmp5 = icmp sgt i32 %iters, 0
+ br i1 %cmp5, label %for.body.preheader, label %cleanup
+
+for.body.preheader: ; preds = %entry
+ %0 = sext i32 %iters to i64
+ br label %for.body
+
+for.cond: ; preds = %for.body
+ %cmp = icmp slt i64 %indvars.iv.next, %0
+ br i1 %cmp, label %for.body, label %cleanup
+
+for.body: ; preds = %for.body.preheader, %for.cond
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.cond ]
+ %arrayidx = getelementptr inbounds i32, i32* %vec, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %1, %limit
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br i1 %cmp1, label %for.cond, label %cleanup
+
+cleanup: ; preds = %for.body, %for.cond, %entry
+ %2 = phi i32 [ 0, %entry ], [ 0, %for.cond ], [ 1, %for.body ]
+ ret i32 %2
+; CHECK-LABEL: limit_loop
+; CHECK: mtctr
+; CHECK-NOT: addi {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK: bdnz
+; CHECK: blr
+}
+
+
diff --git a/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll b/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
new file mode 100644
index 000000000000..87b45beeab7e
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
@@ -0,0 +1,32 @@
+; Note that the formula for negative-number alignment should be y = x & ~(n-1) rather than y = (x + (n-1)) & ~(n-1).
+; After patch https://reviews.llvm.org/D34337, we can save 16 bytes in the best case.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
+
+define signext i32 @bar(i32 signext %ii) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $0, $1, $2\0A", "=r,r,r,~{f14},~{r15},~{v20}"(i32 %ii, i32 10)
+ ret i32 %0
+; Before the fix by patch D34337:
+; stdu 1, -544(1)
+; std 15, 264(1)
+; stfd 14, 400(1)
+; stdu 1, -560(1)
+; std 15, 280(1)
+; stfd 14, 416(1)
+
+; After the fix by patch D34337:
+; CHECK-LE: stdu 1, -528(1)
+; CHECK-LE: std 15, 248(1)
+; CHECK-LE: stfd 14, 384(1)
+; CHECK-BE: stdu 1, -544(1)
+; CHECK-BE: std 15, 264(1)
+; CHECK-BE: stfd 14, 400(1)
+}
+
+define signext i32 @foo() {
+entry:
+ %call = tail call signext i32 @bar(i32 signext 5)
+ ret i32 %call
+}
+
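As a minimal standalone sketch (not part of the committed test), the following C program shows how the two alignment formulas named in the comment above behave on a negative stack offset; the offset -520 and the alignment 16 are illustrative assumptions, not values taken from the test.

    #include <stdint.h>
    #include <stdio.h>

    /* x & ~(n-1) rounds x down toward negative infinity, while
       (x + (n-1)) & ~(n-1) rounds x up toward positive infinity.
       For an offset that is not already n-aligned, the two results
       differ by exactly n bytes. */
    static int64_t align_down(int64_t x, int64_t n)   { return x & ~(n - 1); }
    static int64_t align_biased(int64_t x, int64_t n) { return (x + (n - 1)) & ~(n - 1); }

    int main(void) {
        int64_t x = -520, n = 16;                 /* illustrative values only */
        printf("x & ~(n-1)          = %lld\n", (long long)align_down(x, n));   /* -528 */
        printf("(x+(n-1)) & ~(n-1)  = %lld\n", (long long)align_biased(x, n)); /* -512 */
        return 0;
    }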
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
index 0e871c358869..3a425406d043 100644
--- a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -53,8 +53,8 @@ entry:
ret void
}
; CHECK: @caller2
-; CHECK: li [[TOCOFF:[0-9]+]], 136
-; CHECK: stxsspx {{[0-9]+}}, 1, [[TOCOFF]]
+; CHECK: addi [[TOCOFF:[0-9]+]], {{[0-9]+}}, 136
+; CHECK: stxsspx {{[0-9]+}}, 0, [[TOCOFF]]
; CHECK: bl test2
declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
diff --git a/test/CodeGen/PowerPC/pr33093.ll b/test/CodeGen/PowerPC/pr33093.ll
new file mode 100644
index 000000000000..5212973f8317
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr33093.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+define zeroext i32 @ReverseBits(i32 zeroext %n) {
+; CHECK-LABEL: ReverseBits:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: slwi 6, 3, 1
+; CHECK-NEXT: srwi 3, 3, 1
+; CHECK-NEXT: lis 7, -13108
+; CHECK-NEXT: lis 8, 13107
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: lis 10, -3856
+; CHECK-NEXT: lis 11, 3855
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: ori 5, 8, 13107
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 7, 52428
+; CHECK-NEXT: slwi 9, 3, 2
+; CHECK-NEXT: srwi 3, 3, 2
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 9, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: slwi 12, 3, 4
+; CHECK-NEXT: srwi 3, 3, 4
+; CHECK-NEXT: and 4, 12, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rotlwi 4, 3, 24
+; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31
+; CHECK-NEXT: rldicl 3, 4, 0, 32
+; CHECK-NEXT: clrldi 3, 3, 32
+; CHECK-NEXT: blr
+entry:
+ %shr = lshr i32 %n, 1
+ %and = and i32 %shr, 1431655765
+ %and1 = shl i32 %n, 1
+ %shl = and i32 %and1, -1431655766
+ %or = or i32 %and, %shl
+ %shr2 = lshr i32 %or, 2
+ %and3 = and i32 %shr2, 858993459
+ %and4 = shl i32 %or, 2
+ %shl5 = and i32 %and4, -858993460
+ %or6 = or i32 %and3, %shl5
+ %shr7 = lshr i32 %or6, 4
+ %and8 = and i32 %shr7, 252645135
+ %and9 = shl i32 %or6, 4
+ %shl10 = and i32 %and9, -252645136
+ %or11 = or i32 %and8, %shl10
+ %shr13 = lshr i32 %or11, 24
+ %and14 = lshr i32 %or11, 8
+ %shr15 = and i32 %and14, 65280
+ %and17 = shl i32 %or11, 8
+ %shl18 = and i32 %and17, 16711680
+ %shl21 = shl i32 %or11, 24
+ %or16 = or i32 %shl21, %shr13
+ %or19 = or i32 %or16, %shr15
+ %or22 = or i32 %or19, %shl18
+ ret i32 %or22
+}
+
+define i64 @ReverseBits64(i64 %n) {
+; CHECK-LABEL: ReverseBits64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: lis 6, -13108
+; CHECK-NEXT: lis 7, 13107
+; CHECK-NEXT: sldi 8, 3, 1
+; CHECK-NEXT: rldicl 3, 3, 63, 1
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: ori 6, 6, 52428
+; CHECK-NEXT: ori 7, 7, 13107
+; CHECK-NEXT: sldi 4, 4, 32
+; CHECK-NEXT: sldi 5, 5, 32
+; CHECK-NEXT: oris 4, 4, 43690
+; CHECK-NEXT: oris 5, 5, 21845
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: lis 7, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 12, 5, 52428
+; CHECK-NEXT: oris 9, 6, 13107
+; CHECK-NEXT: lis 6, -3856
+; CHECK-NEXT: ori 7, 7, 3855
+; CHECK-NEXT: sldi 8, 3, 2
+; CHECK-NEXT: ori 4, 12, 52428
+; CHECK-NEXT: rldicl 3, 3, 62, 2
+; CHECK-NEXT: ori 5, 9, 13107
+; CHECK-NEXT: ori 6, 6, 61680
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 10, 5, 61680
+; CHECK-NEXT: oris 11, 6, 3855
+; CHECK-NEXT: sldi 6, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: rldicl 3, 3, 60, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rldicl 4, 3, 32, 32
+; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31
+; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31
+; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15
+; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31
+; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31
+; CHECK-NEXT: sldi 12, 5, 32
+; CHECK-NEXT: or 3, 12, 6
+; CHECK-NEXT: blr
+entry:
+ %shr = lshr i64 %n, 1
+ %and = and i64 %shr, 6148914691236517205
+ %and1 = shl i64 %n, 1
+ %shl = and i64 %and1, -6148914691236517206
+ %or = or i64 %and, %shl
+ %shr2 = lshr i64 %or, 2
+ %and3 = and i64 %shr2, 3689348814741910323
+ %and4 = shl i64 %or, 2
+ %shl5 = and i64 %and4, -3689348814741910324
+ %or6 = or i64 %and3, %shl5
+ %shr7 = lshr i64 %or6, 4
+ %and8 = and i64 %shr7, 1085102592571150095
+ %and9 = shl i64 %or6, 4
+ %shl10 = and i64 %and9, -1085102592571150096
+ %or11 = or i64 %and8, %shl10
+ %shr13 = lshr i64 %or11, 56
+ %and14 = lshr i64 %or11, 40
+ %shr15 = and i64 %and14, 65280
+ %and17 = lshr i64 %or11, 24
+ %shr18 = and i64 %and17, 16711680
+ %and20 = lshr i64 %or11, 8
+ %shr21 = and i64 %and20, 4278190080
+ %and23 = shl i64 %or11, 8
+ %shl24 = and i64 %and23, 1095216660480
+ %and26 = shl i64 %or11, 24
+ %shl27 = and i64 %and26, 280375465082880
+ %and29 = shl i64 %or11, 40
+ %shl30 = and i64 %and29, 71776119061217280
+ %shl33 = shl i64 %or11, 56
+ %or16 = or i64 %shl33, %shr13
+ %or19 = or i64 %or16, %shr15
+ %or22 = or i64 %or19, %shr18
+ %or25 = or i64 %or22, %shr21
+ %or28 = or i64 %or25, %shl24
+ %or31 = or i64 %or28, %shl27
+ %or34 = or i64 %or31, %shl30
+ ret i64 %or34
+}
diff --git a/test/CodeGen/PowerPC/select-addrRegRegOnly.ll b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll
new file mode 100644
index 000000000000..f880d1faf9d9
--- /dev/null
+++ b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown -verify-machineinstrs < %s | FileCheck %s
+
+; Function Attrs: norecurse nounwind readonly
+define float @testSingleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 {
+; CHECK-LABEL: testSingleAccess:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addi 3, 3, 8
+; CHECK-NEXT: lxsiwax 0, 0, 3
+; CHECK-NEXT: xscvsxdsp 1, 0
+; CHECK-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2
+ %0 = load i32, i32* %arrayidx, align 4
+ %conv = sitofp i32 %0 to float
+ ret float %conv
+}
+
+; Function Attrs: norecurse nounwind readonly
+define float @testMultipleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 {
+; CHECK-LABEL: testMultipleAccess:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lwz 4, 8(3)
+; CHECK-NEXT: lwz 12, 12(3)
+; CHECK-NEXT: add 3, 12, 4
+; CHECK-NEXT: mtvsrwa 0, 3
+; CHECK-NEXT: xscvsxdsp 1, 0
+; CHECK-NEXT: blr
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 3
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %1, %0
+ %conv = sitofp i32 %add to float
+ ret float %conv
+}
diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll
index 7bb6cc180c96..26c4410ded6d 100644
--- a/test/CodeGen/PowerPC/svr4-redzone.ll
+++ b/test/CodeGen/PowerPC/svr4-redzone.ll
@@ -29,11 +29,11 @@ entry:
define i8* @bigstack() nounwind {
entry:
- %0 = alloca i8, i32 230
+ %0 = alloca i8, i32 290
ret i8* %0
}
; PPC32-LABEL: bigstack:
-; PPC32: stwu 1, -240(1)
+; PPC32: stwu 1, -304(1)
; PPC64-LABEL: bigstack:
-; PPC64: stdu 1, -288(1)
+; PPC64: stdu 1, -352(1)
diff --git a/test/CodeGen/PowerPC/tailcall1-64.ll b/test/CodeGen/PowerPC/tailcall1-64.ll
index 3dc2672556ea..58ab0bce309c 100644
--- a/test/CodeGen/PowerPC/tailcall1-64.ll
+++ b/test/CodeGen/PowerPC/tailcall1-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -relocation-model=static -verify-machineinstrs < %s -march=ppc64 -tailcallopt | grep TC_RETURNd8
+; RUN: llc -relocation-model=static -verify-machineinstrs < %s -mtriple=ppc64-- -tailcallopt | grep TC_RETURNd8
+; RUN: llc -relocation-model=static -verify-machineinstrs -mtriple=ppc64-- < %s | FileCheck %s
define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
entry:
ret i32 %a3
@@ -6,6 +7,8 @@ entry:
define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
entry:
- %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 )
ret i32 %tmp11
+; CHECK-LABEL: tailcaller
+; CHECK-NOT: stdu
}
diff --git a/test/CodeGen/PowerPC/testBitReverse.ll b/test/CodeGen/PowerPC/testBitReverse.ll
new file mode 100644
index 000000000000..6993d17ad8f3
--- /dev/null
+++ b/test/CodeGen/PowerPC/testBitReverse.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+declare i32 @llvm.bitreverse.i32(i32)
+define i32 @testBitReverseIntrinsicI32(i32 %arg) {
+; CHECK-LABEL: testBitReverseIntrinsicI32:
+; CHECK: # BB#0:
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: slwi 6, 3, 1
+; CHECK-NEXT: srwi 3, 3, 1
+; CHECK-NEXT: lis 7, -13108
+; CHECK-NEXT: lis 8, 13107
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: lis 10, -3856
+; CHECK-NEXT: lis 11, 3855
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: ori 5, 8, 13107
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 7, 52428
+; CHECK-NEXT: slwi 9, 3, 2
+; CHECK-NEXT: srwi 3, 3, 2
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: and 4, 9, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: slwi 12, 3, 4
+; CHECK-NEXT: srwi 3, 3, 4
+; CHECK-NEXT: and 4, 12, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rotlwi 4, 3, 24
+; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31
+; CHECK-NEXT: rldicl 3, 4, 0, 32
+; CHECK-NEXT: blr
+ %res = call i32 @llvm.bitreverse.i32(i32 %arg)
+ ret i32 %res
+}
+
+declare i64 @llvm.bitreverse.i64(i64)
+define i64 @testBitReverseIntrinsicI64(i64 %arg) {
+; CHECK-LABEL: testBitReverseIntrinsicI64:
+; CHECK: # BB#0:
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: lis 6, -13108
+; CHECK-NEXT: lis 7, 13107
+; CHECK-NEXT: sldi 8, 3, 1
+; CHECK-NEXT: rldicl 3, 3, 63, 1
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: ori 6, 6, 52428
+; CHECK-NEXT: ori 7, 7, 13107
+; CHECK-NEXT: sldi 4, 4, 32
+; CHECK-NEXT: sldi 5, 5, 32
+; CHECK-NEXT: oris 4, 4, 43690
+; CHECK-NEXT: oris 5, 5, 21845
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT: ori 5, 5, 21845
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: lis 7, 3855
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 12, 5, 52428
+; CHECK-NEXT: oris 9, 6, 13107
+; CHECK-NEXT: lis 6, -3856
+; CHECK-NEXT: ori 7, 7, 3855
+; CHECK-NEXT: sldi 8, 3, 2
+; CHECK-NEXT: ori 4, 12, 52428
+; CHECK-NEXT: rldicl 3, 3, 62, 2
+; CHECK-NEXT: ori 5, 9, 13107
+; CHECK-NEXT: ori 6, 6, 61680
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: sldi 5, 6, 32
+; CHECK-NEXT: and 4, 8, 4
+; CHECK-NEXT: sldi 6, 7, 32
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: oris 10, 5, 61680
+; CHECK-NEXT: oris 11, 6, 3855
+; CHECK-NEXT: sldi 6, 3, 4
+; CHECK-NEXT: ori 4, 10, 61680
+; CHECK-NEXT: rldicl 3, 3, 60, 4
+; CHECK-NEXT: ori 5, 11, 3855
+; CHECK-NEXT: and 4, 6, 4
+; CHECK-NEXT: and 3, 3, 5
+; CHECK-NEXT: or 3, 3, 4
+; CHECK-NEXT: rldicl 4, 3, 32, 32
+; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31
+; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31
+; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15
+; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15
+; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31
+; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31
+; CHECK-NEXT: sldi 12, 5, 32
+; CHECK-NEXT: or 3, 12, 6
+; CHECK-NEXT: blr
+ %res = call i64 @llvm.bitreverse.i64(i64 %arg)
+ ret i64 %res
+}
diff --git a/test/CodeGen/PowerPC/vec_extract_p9.ll b/test/CodeGen/PowerPC/vec_extract_p9.ll
new file mode 100644
index 000000000000..241209a0e6b7
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_extract_p9.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE
+
+define zeroext i8 @test1(<16 x i8> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test1:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextubrx 3, 5, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 56
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test1:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextublx 3, 5, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 56
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 %index
+ ret i8 %vecext
+}
+
+define signext i8 @test2(<16 x i8> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test2:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextubrx 3, 5, 2
+; CHECK-LE-NEXT: extsb 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test2:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextublx 3, 5, 2
+; CHECK-BE-NEXT: extsb 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 %index
+ ret i8 %vecext
+}
+
+define zeroext i16 @test3(<8 x i16> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test3:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 48
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test3:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 48
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 %index
+ ret i16 %vecext
+}
+
+define signext i16 @test4(<8 x i16> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test4:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: extsh 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test4:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: extsh 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 %index
+ ret i16 %vecext
+}
+
+define zeroext i32 @test5(<4 x i32> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test5:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test5:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 %index
+ ret i32 %vecext
+}
+
+define signext i32 @test6(<4 x i32> %a, i32 signext %index) {
+; CHECK-LE-LABEL: test6:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: extsw 3, 3
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test6:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: extsw 3, 3
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 %index
+ ret i32 %vecext
+}
+
+; Test with immediate index
+define zeroext i8 @test7(<16 x i8> %a) {
+; CHECK-LE-LABEL: test7:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 1
+; CHECK-LE-NEXT: vextubrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 56
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test7:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 1
+; CHECK-BE-NEXT: vextublx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 56
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 1
+ ret i8 %vecext
+}
+
+define zeroext i16 @test8(<8 x i16> %a) {
+; CHECK-LE-LABEL: test8:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 2
+; CHECK-LE-NEXT: vextuhrx 3, 3, 2
+; CHECK-LE-NEXT: clrldi 3, 3, 48
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test8:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 2
+; CHECK-BE-NEXT: vextuhlx 3, 3, 2
+; CHECK-BE-NEXT: clrldi 3, 3, 48
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 1
+ ret i16 %vecext
+}
+
+define zeroext i32 @test9(<4 x i32> %a) {
+; CHECK-LE-LABEL: test9:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: li 3, 4
+; CHECK-LE-NEXT: vextuwrx 3, 3, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: test9:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: li 3, 4
+; CHECK-BE-NEXT: vextuwlx 3, 3, 2
+; CHECK-BE-NEXT: blr
+
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 1
+ ret i32 %vecext
+}
diff --git a/test/CodeGen/PowerPC/vec_int_ext.ll b/test/CodeGen/PowerPC/vec_int_ext.ll
index 9e1218c423b7..d7bed503318e 100644
--- a/test/CodeGen/PowerPC/vec_int_ext.ll
+++ b/test/CodeGen/PowerPC/vec_int_ext.ll
@@ -1,12 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9
-target triple = "powerpc64le-unknown-linux-gnu"
-
-define <4 x i32> @vextsb2w(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2w:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsb2w 2, 2
-; PWR9-NEXT: blr
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE
+
+define <4 x i32> @vextsb2wLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2wLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsb2w 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsb2wLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsb2w 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <16 x i8> %a, i32 0
%conv = sext i8 %vecext to i32
@@ -23,11 +29,17 @@ entry:
ret <4 x i32> %vecinit9
}
-define <2 x i64> @vextsb2d(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsb2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsb2dLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsb2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsb2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsb2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <16 x i8> %a, i32 0
%conv = sext i8 %vecext to i64
@@ -38,11 +50,17 @@ entry:
ret <2 x i64> %vecinit3
}
-define <4 x i32> @vextsh2w(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2w:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsh2w 2, 2
-; PWR9-NEXT: blr
+define <4 x i32> @vextsh2wLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2wLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsh2w 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsh2wLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsh2w 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <8 x i16> %a, i32 0
%conv = sext i16 %vecext to i32
@@ -59,11 +77,17 @@ entry:
ret <4 x i32> %vecinit9
}
-define <2 x i64> @vextsh2d(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsh2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsh2dLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsh2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsh2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vperm 2, 2, 2, 3
+; CHECK-BE-NEXT: vextsh2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <8 x i16> %a, i32 0
%conv = sext i16 %vecext to i64
@@ -74,11 +98,17 @@ entry:
ret <2 x i64> %vecinit3
}
-define <2 x i64> @vextsw2d(<4 x i32> %a) {
-; PWR9-LABEL: vextsw2d:
-; PWR9: # BB#0: # %entry
-; PWR9-NEXT: vextsw2d 2, 2
-; PWR9-NEXT: blr
+define <2 x i64> @vextsw2dLE(<4 x i32> %a) {
+; CHECK-LE-LABEL: vextsw2dLE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vextsw2d 2, 2
+; CHECK-LE-NEXT: blr
+; CHECK-BE-LABEL: vextsw2dLE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE: vmrgew
+; CHECK-BE-NEXT: vextsw2d 2, 2
+; CHECK-BE-NEXT: blr
+
entry:
%vecext = extractelement <4 x i32> %a, i32 0
%conv = sext i32 %vecext to i64
@@ -88,3 +118,170 @@ entry:
%vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
ret <2 x i64> %vecinit3
}
+
+define <4 x i32> @vextsb2wBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2wBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsb2w 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsb2wBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 13
+; CHECK-LE-NEXT: vextsb2w 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 3
+ %conv = sext i8 %vecext to i32
+ %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 7
+ %conv2 = sext i8 %vecext1 to i32
+ %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+ %vecext4 = extractelement <16 x i8> %a, i32 11
+ %conv5 = sext i8 %vecext4 to i32
+ %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+ %vecext7 = extractelement <16 x i8> %a, i32 15
+ %conv8 = sext i8 %vecext7 to i32
+ %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+ ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsb2dBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsb2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsb2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 9
+; CHECK-LE-NEXT: vextsb2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <16 x i8> %a, i32 7
+ %conv = sext i8 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 15
+ %conv2 = sext i8 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <4 x i32> @vextsh2wBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2wBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsh2w 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsh2wBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 14
+; CHECK-LE-NEXT: vextsh2w 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 1
+ %conv = sext i16 %vecext to i32
+ %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+ %vecext1 = extractelement <8 x i16> %a, i32 3
+ %conv2 = sext i16 %vecext1 to i32
+ %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+ %vecext4 = extractelement <8 x i16> %a, i32 5
+ %conv5 = sext i16 %vecext4 to i32
+ %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+ %vecext7 = extractelement <8 x i16> %a, i32 7
+ %conv8 = sext i16 %vecext7 to i32
+ %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+ ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsh2dBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsh2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsh2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 10
+; CHECK-LE-NEXT: vextsh2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <8 x i16> %a, i32 3
+ %conv = sext i16 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <8 x i16> %a, i32 7
+ %conv2 = sext i16 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextsw2dBE(<4 x i32> %a) {
+; CHECK-BE-LABEL: vextsw2dBE:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NEXT: vextsw2d 2, 2
+; CHECK-BE-NEXT: blr
+; CHECK-LE-LABEL: vextsw2dBE:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NEXT: vsldoi 2, 2, 2, 12
+; CHECK-LE-NEXT: vextsw2d 2, 2
+; CHECK-LE-NEXT: blr
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 1
+ %conv = sext i32 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <4 x i32> %a, i32 3
+ %conv2 = sext i32 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextDiffVectors(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LE-LABEL: vextDiffVectors:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NOT: vextsw2d
+
+; CHECK-BE-LABEL: vextDiffVectors:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NOT: vextsw2d
+entry:
+ %vecext = extractelement <4 x i32> %a, i32 0
+ %conv = sext i32 %vecext to i64
+ %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+ %vecext1 = extractelement <4 x i32> %b, i32 2
+ %conv2 = sext i32 %vecext1 to i64
+ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+ ret <2 x i64> %vecinit3
+}
+
+define <8 x i16> @testInvalidExtend(<16 x i8> %a) {
+entry:
+; CHECK-LE-LABEL: testInvalidExtend:
+; CHECK-LE: # BB#0: # %entry
+; CHECK-LE-NOT: vexts
+
+; CHECK-BE-LABEL: testInvalidExtend:
+; CHECK-BE: # BB#0: # %entry
+; CHECK-BE-NOT: vexts
+
+ %vecext = extractelement <16 x i8> %a, i32 0
+ %conv = sext i8 %vecext to i16
+ %vecinit = insertelement <8 x i16> undef, i16 %conv, i32 0
+ %vecext1 = extractelement <16 x i8> %a, i32 2
+ %conv2 = sext i8 %vecext1 to i16
+ %vecinit3 = insertelement <8 x i16> %vecinit, i16 %conv2, i32 1
+ %vecext4 = extractelement <16 x i8> %a, i32 4
+ %conv5 = sext i8 %vecext4 to i16
+ %vecinit6 = insertelement <8 x i16> %vecinit3, i16 %conv5, i32 2
+ %vecext7 = extractelement <16 x i8> %a, i32 6
+ %conv8 = sext i8 %vecext7 to i16
+ %vecinit9 = insertelement <8 x i16> %vecinit6, i16 %conv8, i32 3
+ %vecext10 = extractelement <16 x i8> %a, i32 8
+ %conv11 = sext i8 %vecext10 to i16
+ %vecinit12 = insertelement <8 x i16> %vecinit9, i16 %conv11, i32 4
+ %vecext13 = extractelement <16 x i8> %a, i32 10
+ %conv14 = sext i8 %vecext13 to i16
+ %vecinit15 = insertelement <8 x i16> %vecinit12, i16 %conv14, i32 5
+ %vecext16 = extractelement <16 x i8> %a, i32 12
+ %conv17 = sext i8 %vecext16 to i16
+ %vecinit18 = insertelement <8 x i16> %vecinit15, i16 %conv17, i32 6
+ %vecext19 = extractelement <16 x i8> %a, i32 14
+ %conv20 = sext i8 %vecext19 to i16
+ %vecinit21 = insertelement <8 x i16> %vecinit18, i16 %conv20, i32 7
+ ret <8 x i16> %vecinit21
+}
diff --git a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
index 67146e40db0e..5346d8a429fb 100644
--- a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
+++ b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
@@ -321,8 +321,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecucus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -385,8 +385,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecscus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -487,8 +487,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecucss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
@@ -540,8 +540,8 @@ entry:
; CHECK: lxsibzx 34, 0, 3
; CHECK-NEXT: vspltb 2, 2, 7
; CHECK-BE-LABEL: vecscss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
; CHECK-BE-NEXT: vspltb 2, 2, 7
}
diff --git a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
new file mode 100644
index 000000000000..8798fcecfc3b
--- /dev/null
+++ b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
@@ -0,0 +1,34 @@
+# RUN: llc -verify-machineinstrs -run-pass regallocfast -mtriple s390x-ibm-linux -o - %s | FileCheck %s
+--- |
+
+ @g_167 = external global [5 x i64], align 8
+ define void @main() local_unnamed_addr {
+ ret void
+ }
+...
+# Make sure the usage of different subregisters on the same virtual register
+# does not result in invalid kill flags.
+# PR33677
+---
+name: main
+alignment: 2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr128bit }
+ - { id: 1, class: gr64bit }
+ - { id: 2, class: addr64bit }
+# CHECK: %r0q = L128
+# CHECK-NEXT: %r0l = COPY %r1l
+# Although R0L partially redefines R0Q, it must not mark R0Q as kill
+# because R1D is still live through that instruction.
+# CHECK-NOT: %r0q<imp-use,kill>
+# CHECK-NEXT: %r2d = COPY %r1d
+# CHECK-NEXT: LARL
+body: |
+ bb.0:
+ %0.subreg_hl32 = COPY %0.subreg_l32
+ %1 = COPY %0.subreg_l64
+ %2 = LARL @g_167
+ STC %1.subreg_l32, %2, 8, _
+
+...
diff --git a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
index 9fcc0f5d617b..5c3800e97093 100644
--- a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
+++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
@@ -95,15 +95,17 @@ if.end:
}
; CHECK-LABEL: diamond2:
-; CHECK-BP: itte
-; CHECK-BP: streq
-; CHECK-BP: ldreq
-; CHECK-BP: strne
-; CHECK-NOBP: cbz
-; CHECK-NOBP: str
-; CHECK-NOBP: b
-; CHECK-NOBP: str
-; CHECK-NOBP: ldr
+; CHECK-BP: cbz
+; CHECK-BP: str
+; CHECK-BP: str
+; CHECK-BP: b
+; CHECK-BP: str
+; CHECK-BP: ldr
+; CHECK-NOBP: ittee
+; CHECK-NOBP: streq
+; CHECK-NOBP: ldreq
+; CHECK-NOBP: strne
+; CHECK-NOBP: strne
define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
entry:
%tobool = icmp eq i32 %n, 0
@@ -111,6 +113,8 @@ entry:
if.then:
store i32 %n, i32* %p, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 2
+ store i32 %n, i32* %arrayidx, align 4
br label %if.end
if.else:
diff --git a/test/CodeGen/WebAssembly/umulo-i64.ll b/test/CodeGen/WebAssembly/umulo-i64.ll
new file mode 100644
index 000000000000..e47c8aa0bb3a
--- /dev/null
+++ b/test/CodeGen/WebAssembly/umulo-i64.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; Test that UMULO works correctly on 64-bit operands.
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-emscripten"
+
+; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE:
+; CHECK: __multi3
+; Function Attrs: inlinehint
+define void @"_ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE"(i64, i64) unnamed_addr #0 {
+start:
+ %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1)
+ %3 = extractvalue { i64, i1 } %2, 0
+ store i64 %3, i64* undef
+ unreachable
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { inlinehint }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
index c03b923cadba..cba208e62a14 100644
--- a/test/CodeGen/X86/2012-08-16-setcc.ll
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -1,45 +1,53 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; rdar://12081007
-; CHECK-LABEL: and_1:
-; CHECK: andb
-; CHECK-NEXT: cmovnel
-; CHECK: ret
define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+; CHECK-LABEL: and_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: andb %dil, %sil
+; CHECK-NEXT: cmovnel %edx, %eax
+; CHECK-NEXT: retq
%1 = and i8 %b, %a
%2 = icmp ne i8 %1, 0
%3 = select i1 %2, i32 %x, i32 0
ret i32 %3
}
-; CHECK-LABEL: and_2:
-; CHECK: andb
-; CHECK-NEXT: setne
-; CHECK: ret
define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: and_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andb %dil, %sil
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%1 = and i8 %b, %a
%2 = icmp ne i8 %1, 0
ret i1 %2
}
-; CHECK-LABEL: xor_1:
-; CHECK: xorb
-; CHECK-NEXT: cmovnel
-; CHECK: ret
define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+; CHECK-LABEL: xor_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorb %dil, %sil
+; CHECK-NEXT: cmovnel %edx, %eax
+; CHECK-NEXT: retq
%1 = xor i8 %b, %a
%2 = icmp ne i8 %1, 0
%3 = select i1 %2, i32 %x, i32 0
ret i32 %3
}
-; CHECK-LABEL: xor_2:
-; CHECK: xorb
-; CHECK-NEXT: setne
-; CHECK: ret
define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: xor_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorb %dil, %sil
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%1 = xor i8 %b, %a
%2 = icmp ne i8 %1, 0
ret i1 %2
}
+
diff --git a/test/CodeGen/X86/GC/badreadproto.ll b/test/CodeGen/X86/GC/badreadproto.ll
index 37672f804357..aad79d75218a 100644
--- a/test/CodeGen/X86/GC/badreadproto.ll
+++ b/test/CodeGen/X86/GC/badreadproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
diff --git a/test/CodeGen/X86/GC/badrootproto.ll b/test/CodeGen/X86/GC/badrootproto.ll
index ff86d03c646a..37a3451c2c17 100644
--- a/test/CodeGen/X86/GC/badrootproto.ll
+++ b/test/CodeGen/X86/GC/badrootproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
%meta = type opaque
diff --git a/test/CodeGen/X86/GC/badwriteproto.ll b/test/CodeGen/X86/GC/badwriteproto.ll
index 2544e40f81ff..62c157477635 100644
--- a/test/CodeGen/X86/GC/badwriteproto.ll
+++ b/test/CodeGen/X86/GC/badwriteproto.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
%list = type { i32, %list* }
diff --git a/test/CodeGen/X86/GC/fat.ll b/test/CodeGen/X86/GC/fat.ll
index d05ca3da8195..316a80343e2f 100644
--- a/test/CodeGen/X86/GC/fat.ll
+++ b/test/CodeGen/X86/GC/fat.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @llvm.gcroot(i8**, i8*) nounwind
diff --git a/test/CodeGen/X86/GC/outside.ll b/test/CodeGen/X86/GC/outside.ll
index 2968c6917ce1..55eda5453789 100644
--- a/test/CodeGen/X86/GC/outside.ll
+++ b/test/CodeGen/X86/GC/outside.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
declare void @llvm.gcroot(i8**, i8*)
diff --git a/test/CodeGen/X86/GlobalISel/GV.ll b/test/CodeGen/X86/GlobalISel/GV.ll
new file mode 100644
index 000000000000..44862ab5a96e
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/GV.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-apple-darwin -global-isel -verify-machineinstrs -relocation-model=pic < %s -o - | FileCheck %s --check-prefix=X64_DARWIN_PIC
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI
+
+@g_int = global i32 0, align 4
+
+; Function Attrs: noinline nounwind optnone uwtable
+define i32* @test_global_ptrv() #3 {
+; X64-LABEL: test_global_ptrv:
+; X64: # BB#0: # %entry
+; X64-NEXT: leaq g_int, %rax
+; X64-NEXT: retq
+;
+; X64_DARWIN_PIC-LABEL: test_global_ptrv:
+; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
+; X64_DARWIN_PIC-NEXT: retq
+;
+; X32-LABEL: test_global_ptrv:
+; X32: # BB#0: # %entry
+; X32-NEXT: leal g_int, %eax
+; X32-NEXT: retl
+;
+; X32ABI-LABEL: test_global_ptrv:
+; X32ABI: # BB#0: # %entry
+; X32ABI-NEXT: leal g_int, %eax
+; X32ABI-NEXT: retq
+entry:
+ ret i32* @g_int
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define i32 @test_global_valv() #3 {
+; X64-LABEL: test_global_valv:
+; X64: # BB#0: # %entry
+; X64-NEXT: leaq g_int, %rax
+; X64-NEXT: movl (%rax), %eax
+; X64-NEXT: retq
+;
+; X64_DARWIN_PIC-LABEL: test_global_valv:
+; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
+; X64_DARWIN_PIC-NEXT: movl (%rax), %eax
+; X64_DARWIN_PIC-NEXT: retq
+;
+; X32-LABEL: test_global_valv:
+; X32: # BB#0: # %entry
+; X32-NEXT: leal g_int, %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: retl
+;
+; X32ABI-LABEL: test_global_valv:
+; X32ABI: # BB#0: # %entry
+; X32ABI-NEXT: leal g_int, %eax
+; X32ABI-NEXT: movl (%eax), %eax
+; X32ABI-NEXT: retq
+entry:
+ %0 = load i32, i32* @g_int, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/add-vec.ll b/test/CodeGen/X86/GlobalISel/add-vec.ll
index 679a49d733a2..0ea1cf820c0f 100644
--- a/test/CodeGen/X86/GlobalISel/add-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/add-vec.ll
@@ -1,38 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=core-avx2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7-avx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+
define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; SKX-LABEL: test_add_v16i8:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v16i8:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <16 x i8> %arg1, %arg2
ret <16 x i8> %ret
}
define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; SKX-LABEL: test_add_v8i16:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <8 x i16> %arg1, %arg2
ret <8 x i16> %ret
}
define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; SKX-LABEL: test_add_v4i32:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v4i32:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <4 x i32> %arg1, %arg2
ret <4 x i32> %ret
}
define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
-; SKX-LABEL: test_add_v2i64:
-; SKX: # BB#0:
-; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: test_add_v2i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%ret = add <2 x i64> %arg1, %arg2
ret <2 x i64> %ret
}
@@ -42,6 +45,20 @@ define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <32 x i8> %arg1, %arg2
ret <32 x i8> %ret
}
@@ -51,6 +68,20 @@ define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <16 x i16> %arg1, %arg2
ret <16 x i16> %ret
}
@@ -60,6 +91,20 @@ define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <8 x i32> %arg1, %arg2
ret <8 x i32> %ret
}
@@ -69,6 +114,20 @@ define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
%ret = add <4 x i64> %arg1, %arg2
ret <4 x i64> %ret
}
@@ -78,6 +137,26 @@ define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <64 x i8> %arg1, %arg2
ret <64 x i8> %ret
}
@@ -87,6 +166,26 @@ define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <32 x i16> %arg1, %arg2
ret <32 x i16> %ret
}
@@ -96,6 +195,26 @@ define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <16 x i32> %arg1, %arg2
ret <16 x i32> %ret
}
@@ -105,6 +224,26 @@ define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
; SKX: # BB#0:
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+;
+; AVX2-LABEL: test_add_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX1-LABEL: test_add_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: retq
%ret = add <8 x i64> %arg1, %arg2
ret <8 x i64> %ret
}
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index b550bb0bc7be..5b512f9ce937 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -51,4 +51,13 @@ define i64 @const_i64_i32() {
ret i64 -1
}
+define void @main(i32 ** %data) {
+; ALL-LABEL: main:
+; ALL: # BB#0:
+; ALL-NEXT: movq $0, %rax
+; ALL-NEXT: movq %rax, (%rdi)
+; ALL-NEXT: retq
+ store i32* null, i32** %data, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index b08ac062fb4b..11b03bd56110 100644
--- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
-; TODO merge with ext.ll after i64 sext suported on 32bit platform
+; TODO merge with ext.ll after i64 sext supported on 32bit platform
define i64 @test_zext_i1(i8 %a) {
; X64-LABEL: test_zext_i1:
diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll
index 392c973c1208..d9a09678cf4b 100644
--- a/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/test/CodeGen/X86/GlobalISel/ext.ll
@@ -2,6 +2,42 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+define i8 @test_zext_i1toi8(i32 %a) {
+; X64-LABEL: test_zext_i1toi8:
+; X64: # BB#0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_zext_i1toi8:
+; X32: # BB#0:
+; X32-NEXT: movl 4(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+ %val = trunc i32 %a to i1
+ %r = zext i1 %val to i8
+ ret i8 %r
+}
+
+define i16 @test_zext_i1toi16(i32 %a) {
+; X64-LABEL: test_zext_i1toi16:
+; X64: # BB#0:
+; X64-NEXT: andw $1, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_zext_i1toi16:
+; X32: # BB#0:
+; X32-NEXT: movl 4(%esp), %eax
+; X32-NEXT: andw $1, %ax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+ %val = trunc i32 %a to i1
+ %r = zext i1 %val to i16
+ ret i16 %r
+}
+
define i32 @test_zext_i1(i32 %a) {
; X64-LABEL: test_zext_i1:
; X64: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/legalize-GV.mir b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
new file mode 100644
index 000000000000..7f9971e4c70a
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
@@ -0,0 +1,31 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+--- |
+
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+...
+---
+name: test_global_ptrv
+# ALL-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: false
+regBankSelected: false
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: _, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+# ALL: %0(p0) = G_GLOBAL_VALUE @g_int
+# ALL-NEXT: %rax = COPY %0(p0)
+# ALL-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index c9add0dc4e95..c86bfd9ee96d 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -1,12 +1,28 @@
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
- define i32 @test_zext_i1(i8 %a) {
+
+ define i8 @test_zext_i1toi8(i1 %a) {
+ %r = zext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_zext_i1toi16(i1 %a) {
+ %r = zext i1 %a to i16
+ ret i16 %r
+ }
+
+ define i32 @test_zext_i1(i8 %a) {
%val = trunc i8 %a to i1
%r = zext i1 %val to i32
ret i32 %r
}
+ define i16 @test_zext_i8toi16(i8 %val) {
+ %r = zext i8 %val to i16
+ ret i16 %r
+ }
+
define i32 @test_zext_i8(i8 %val) {
%r = zext i8 %val to i32
ret i32 %r
@@ -17,12 +33,27 @@
ret i32 %r
}
+ define i8 @test_sext_i1toi8(i1 %a) {
+ %r = sext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_sext_i1toi16(i1 %a) {
+ %r = sext i1 %a to i16
+ ret i16 %r
+ }
+
define i32 @test_sext_i1(i8 %a) {
%val = trunc i8 %a to i1
%r = sext i1 %val to i32
ret i32 %r
}
+ define i16 @test_sext_i8toi16(i8 %val) {
+ %r = sext i8 %val to i16
+ ret i16 %r
+ }
+
define i32 @test_sext_i8(i8 %val) {
%r = sext i8 %val to i32
ret i32 %r
@@ -35,6 +66,52 @@
...
---
+name: test_zext_i1toi8
+# ALL-LABEL: name: test_zext_i1toi8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s8) = G_ZEXT %0(s1)
+# ALL-NEXT: %al = COPY %1(s8)
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_ZEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_zext_i1toi16
+# ALL-LABEL: name: test_zext_i1toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s16) = G_ZEXT %0(s1)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_ZEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_zext_i1
# ALL-LABEL: name: test_zext_i1
alignment: 4
@@ -61,6 +138,29 @@ body: |
...
---
+name: test_zext_i8toi16
+# ALL-LABEL: name: test_zext_i8toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s16) = G_ZEXT %0(s8)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s16) = G_ZEXT %0(s8)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_zext_i8
# ALL-LABEL: name: test_zext_i8
alignment: 4
@@ -107,6 +207,52 @@ body: |
...
---
+name: test_sext_i1toi8
+# ALL-LABEL: name: test_sext_i1toi8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s8) = G_SEXT %0(s1)
+# ALL-NEXT: %al = COPY %1(s8)
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_SEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_sext_i1toi16
+# ALL-LABEL: name: test_sext_i1toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s1) = COPY %edi
+# ALL-NEXT: %1(s16) = G_SEXT %0(s1)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_SEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_sext_i1
# ALL-LABEL: name: test_sext_i1
alignment: 4
@@ -133,6 +279,29 @@ body: |
...
---
+name: test_sext_i8toi16
+# ALL-LABEL: name: test_sext_i8toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s16) = G_SEXT %0(s8)
+# ALL-NEXT: %ax = COPY %1(s16)
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s16) = G_SEXT %0(s8)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_sext_i8
# ALL-LABEL: name: test_sext_i8
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
new file mode 100644
index 000000000000..60d9fc63c14a
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
@@ -0,0 +1,110 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+
+--- |
+ define void @test_memop_s8tos32() {
+ ret void
+ }
+
+ define void @test_memop_s64() {
+ ret void
+ }
+...
+---
+name: test_memop_s8tos32
+# ALL-LABEL: name: test_memop_s8tos32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+ - { id: 6, class: _, preferred-register: '' }
+ - { id: 7, class: _, preferred-register: '' }
+ - { id: 8, class: _, preferred-register: '' }
+ - { id: 9, class: _, preferred-register: '' }
+ - { id: 10, class: _, preferred-register: '' }
+# ALL: %0(p0) = IMPLICIT_DEF
+# ALL-NEXT: %11(s8) = G_LOAD %0(p0) :: (load 1)
+# ALL-NEXT: %9(s1) = G_TRUNC %11(s8)
+# ALL-NEXT: %1(s8) = G_LOAD %0(p0) :: (load 1)
+# ALL-NEXT: %2(s16) = G_LOAD %0(p0) :: (load 2)
+# ALL-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 4)
+# ALL-NEXT: %4(p0) = G_LOAD %0(p0) :: (load 8)
+# ALL-NEXT: %10(s1) = IMPLICIT_DEF
+# ALL-NEXT: %12(s8) = G_ZEXT %10(s1)
+# ALL-NEXT: G_STORE %12(s8), %0(p0) :: (store 1)
+# ALL-NEXT: %5(s8) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %5(s8), %0(p0) :: (store 1)
+# ALL-NEXT: %6(s16) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %6(s16), %0(p0) :: (store 2)
+# ALL-NEXT: %7(s32) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %7(s32), %0(p0) :: (store 4)
+# ALL-NEXT: %8(p0) = IMPLICIT_DEF
+# ALL-NEXT: G_STORE %8(p0), %0(p0) :: (store 8)
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = IMPLICIT_DEF
+ %9(s1) = G_LOAD %0(p0) :: (load 1)
+ %1(s8) = G_LOAD %0(p0) :: (load 1)
+ %2(s16) = G_LOAD %0(p0) :: (load 2)
+ %3(s32) = G_LOAD %0(p0) :: (load 4)
+ %4(p0) = G_LOAD %0(p0) :: (load 8)
+
+ %10(s1) = IMPLICIT_DEF
+ G_STORE %10, %0 :: (store 1)
+ %5(s8) = IMPLICIT_DEF
+ G_STORE %5, %0 :: (store 1)
+ %6(s16) = IMPLICIT_DEF
+ G_STORE %6, %0 :: (store 2)
+ %7(s32) = IMPLICIT_DEF
+ G_STORE %7, %0 :: (store 4)
+ %8(p0) = IMPLICIT_DEF
+ G_STORE %8, %0 :: (store 8)
+...
+---
+name: test_memop_s64
+# ALL-LABEL: name: test_memop_s64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+liveins:
+# X64: %0(p0) = IMPLICIT_DEF
+# X64-NEXT: %1(s64) = G_LOAD %0(p0) :: (load 8)
+# X64-NEXT: %2(s64) = IMPLICIT_DEF
+# X64-NEXT: G_STORE %2(s64), %0(p0) :: (store 8)
+#
+# X32: %0(p0) = IMPLICIT_DEF
+# X32-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 8)
+# X32-NEXT: %6(s32) = G_CONSTANT i32 4
+# X32-NEXT: %5(p0) = G_GEP %0, %6(s32)
+# X32-NEXT: %4(s32) = G_LOAD %5(p0) :: (load 8)
+# X32-NEXT: %1(s64) = G_MERGE_VALUES %3(s32), %4(s32)
+# X32-NEXT: %2(s64) = IMPLICIT_DEF
+# X32-NEXT: %7(s32), %8(s32) = G_UNMERGE_VALUES %2(s64)
+# X32-NEXT: G_STORE %7(s32), %0(p0) :: (store 8)
+# X32-NEXT: %10(s32) = G_CONSTANT i32 4
+# X32-NEXT: %9(p0) = G_GEP %0, %10(s32)
+# X32-NEXT: G_STORE %8(s32), %9(p0) :: (store 8)
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = IMPLICIT_DEF
+ %1(s64) = G_LOAD %0(p0) :: (load 8)
+
+ %2(s64) = IMPLICIT_DEF
+ G_STORE %2, %0 :: (store 8)
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
index 2757e6493258..1c719b1bf74d 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
@@ -4,6 +4,16 @@
;TODO merge with x86-64 tests (many operations not supported yet)
+define i1 @test_load_i1(i1 * %p1) {
+; ALL-LABEL: test_load_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movl 4(%esp), %eax
+; ALL-NEXT: movb (%eax), %al
+; ALL-NEXT: retl
+ %r = load i1, i1* %p1
+ ret i1 %r
+}
+
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
; ALL: # BB#0:
@@ -34,6 +44,18 @@ define i32 @test_load_i32(i32 * %p1) {
ret i32 %r
}
+define i1 * @test_store_i1(i1 %val, i1 * %p1) {
+; ALL-LABEL: test_store_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movb 4(%esp), %cl
+; ALL-NEXT: movl 8(%esp), %eax
+; ALL-NEXT: andb $1, %cl
+; ALL-NEXT: movb %cl, (%eax)
+; ALL-NEXT: retl
+ store i1 %val, i1* %p1
+ ret i1 * %p1;
+}
+
define i8 * @test_store_i8(i8 %val, i8 * %p1) {
; ALL-LABEL: test_store_i8:
; ALL: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index 2e04b3cf20b3..2097a3b0bfc9 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -2,6 +2,15 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST
; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY
+define i1 @test_load_i1(i1 * %p1) {
+; ALL-LABEL: test_load_i1:
+; ALL: # BB#0:
+; ALL-NEXT: movb (%rdi), %al
+; ALL-NEXT: retq
+ %r = load i1, i1* %p1
+ ret i1 %r
+}
+
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
; ALL: # BB#0:
@@ -70,6 +79,17 @@ define double @test_load_double(double * %p1) {
ret double %r
}
+define i1 * @test_store_i1(i1 %val, i1 * %p1) {
+; ALL-LABEL: test_store_i1:
+; ALL: # BB#0:
+; ALL-NEXT: andb $1, %dil
+; ALL-NEXT: movb %dil, (%rsi)
+; ALL-NEXT: movq %rsi, %rax
+; ALL-NEXT: retq
+ store i1 %val, i1* %p1
+ ret i1 * %p1;
+}
+
define i32 * @test_store_i32(i32 %val, i32 * %p1) {
; ALL-LABEL: test_store_i32:
; ALL: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
index 3658bc9af957..95ef15ceb689 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
@@ -174,6 +174,13 @@
ret i64 %ret
}
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+
...
---
name: test_add_i8
@@ -1084,4 +1091,24 @@ body: |
RET 0, implicit %rax
...
+---
+name: test_global_ptrv
+# CHECK-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+# CHECK: %0(p0) = G_GLOBAL_VALUE @g_int
+# CHECK-NEXT: %rax = COPY %0(p0)
+# CHECK-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-GV.mir b/test/CodeGen/X86/GlobalISel/select-GV.mir
new file mode 100644
index 000000000000..2f2fd51d99d1
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-GV.mir
@@ -0,0 +1,99 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64
+# RUN: llc -mtriple=x86_64-apple-darwin -relocation-model=pic -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64_DARWIN_PIC
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32ABI
+
+--- |
+
+ @g_int = global i32 0, align 4
+
+ define i32* @test_global_ptrv() {
+ entry:
+ ret i32* @g_int
+ }
+
+ define i32 @test_global_valv() {
+ entry:
+ %0 = load i32, i32* @g_int, align 4
+ ret i32 %0
+ }
+
+...
+---
+name: test_global_ptrv
+# CHECK-LABEL: name: test_global_ptrv
+alignment: 4
+legalized: true
+regBankSelected: true
+# X64ALL: registers:
+# X64ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+#
+# X32ALL: registers:
+# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+# X64: %0 = LEA64r _, 1, _, @g_int, _
+# X64-NEXT: %rax = COPY %0
+# X64-NEXT: RET 0, implicit %rax
+#
+# X64_DARWIN_PIC: %0 = LEA64r %rip, 1, _, @g_int, _
+# X64_DARWIN_PIC-NEXT: %rax = COPY %0
+# X64_DARWIN_PIC-NEXT: RET 0, implicit %rax
+#
+# X32: %0 = LEA32r _, 1, _, @g_int, _
+# X32-NEXT: %rax = COPY %0
+# X32-NEXT: RET 0, implicit %rax
+#
+# X32ABI: %0 = LEA64_32r _, 1, _, @g_int, _
+# X32ABI-NEXT: %rax = COPY %0
+# X32ABI-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ %0(p0) = G_GLOBAL_VALUE @g_int
+ %rax = COPY %0(p0)
+ RET 0, implicit %rax
+
+...
+---
+name: test_global_valv
+# CHECK-LABEL: name: test_global_valv
+alignment: 4
+legalized: true
+regBankSelected: true
+# X64ALL: registers:
+# X64ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+#
+# X32ALL: registers:
+# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X32ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# X64: %1 = LEA64r _, 1, _, @g_int, _
+# X64-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64-NEXT: %eax = COPY %0
+# X64-NEXT: RET 0, implicit %eax
+#
+# X64_DARWIN_PIC: %1 = LEA64r %rip, 1, _, @g_int, _
+# X64_DARWIN_PIC-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64_DARWIN_PIC-NEXT: %eax = COPY %0
+# X64_DARWIN_PIC-NEXT: RET 0, implicit %eax
+#
+# X32: %1 = LEA32r _, 1, _, @g_int, _
+# X32-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32-NEXT: %eax = COPY %0
+# X32-NEXT: RET 0, implicit %eax
+#
+# X32ABI: %1 = LEA64_32r _, 1, _, @g_int, _
+# X32ABI-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32ABI-NEXT: %eax = COPY %0
+# X32ABI-NEXT: RET 0, implicit %eax
+body: |
+ bb.1.entry:
+ %1(p0) = G_GLOBAL_VALUE @g_int
+ %0(s32) = G_LOAD %1(p0) :: (load 4 from @g_int)
+ %eax = COPY %0(s32)
+ RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir
index 4b91b5f9f098..30f57418b4ce 100644
--- a/test/CodeGen/X86/GlobalISel/select-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -29,6 +29,11 @@
ret i64 -1
}
+ define void @main(i32** %data) {
+ store i32* null, i32** %data, align 8
+ ret void
+ }
+
...
---
name: const_i8
@@ -162,3 +167,29 @@ body: |
RET 0, implicit %rax
...
+---
+name: main
+# CHECK-LABEL: name: main
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = MOV64ri32 0
+# CHECK-NEXT: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.data)
+# CHECK-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = COPY %rdi
+ %1(p0) = G_CONSTANT i64 0
+ G_STORE %1(p0), %0(p0) :: (store 8 into %ir.data)
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index b52f1f6fa621..b6734e5aa2b8 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -2,6 +2,16 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
+ define i8 @test_zext_i1toi8(i1 %a) {
+ %r = zext i1 %a to i8
+ ret i8 %r
+ }
+
+ define i16 @test_zext_i1toi16(i1 %a) {
+ %r = zext i1 %a to i16
+ ret i16 %r
+ }
+
define i32 @test_zext_i1(i1 %a) {
%r = zext i1 %a to i32
ret i32 %r
@@ -29,6 +39,60 @@
...
---
+name: test_zext_i1toi8
+# ALL-LABEL: name: test_zext_i1toi8
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %1 = AND8ri %0, 1, implicit-def %eflags
+# ALL-NEXT: %al = COPY %1
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s8) = G_ZEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_zext_i1toi16
+# ALL-LABEL: name: test_zext_i1toi16
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1 = AND16ri8 %2, 1, implicit-def %eflags
+# ALL-NEXT: %ax = COPY %1
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s16) = G_ZEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
name: test_zext_i1
# ALL-LABEL: name: test_zext_i1
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
new file mode 100644
index 000000000000..09dc5344796f
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512VL
+--- |
+ define void @test_unmerge() {
+ ret void
+ }
+
+...
+---
+name: test_unmerge
+# AVX-LABEL: name: test_unmerge
+#
+# AVX512VL-LABEL: name: test_unmerge
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX: registers:
+# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
+# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
+# AVX-NEXT: - { id: 2, class: vr128, preferred-register: '' }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
+# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
+# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX: %0 = IMPLICIT_DEF
+# AVX-NEXT: %1 = COPY %0.sub_xmm
+# AVX-NEXT: %2 = VEXTRACTF128rr %0, 1
+# AVX-NEXT: %xmm0 = COPY %1
+# AVX-NEXT: %xmm1 = COPY %2
+# AVX-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+#
+# AVX512VL: %0 = IMPLICIT_DEF
+# AVX512VL-NEXT: %1 = COPY %0.sub_xmm
+# AVX512VL-NEXT: %2 = VEXTRACTF32x4Z256rr %0, 1
+# AVX512VL-NEXT: %xmm0 = COPY %1
+# AVX512VL-NEXT: %xmm1 = COPY %2
+# AVX512VL-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>), %2(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
+ %xmm0 = COPY %1(<4 x s32>)
+ %xmm1 = COPY %2(<4 x s32>)
+ RET 0, implicit %xmm0, implicit %xmm1
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
new file mode 100644
index 000000000000..a63733d07f6a
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
@@ -0,0 +1,74 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+--- |
+ define void @test_unmerge_v128() {
+ ret void
+ }
+
+ define void @test_unmerge_v256() {
+ ret void
+ }
+
+...
+---
+name: test_unmerge_v128
+# ALL-LABEL: name: test_unmerge_v128
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: vr128x, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: vr128x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+ - { id: 3, class: vecr }
+ - { id: 4, class: vecr }
+# ALL: %0 = IMPLICIT_DEF
+# ALL-NEXT: %1 = COPY %0.sub_xmm
+# ALL-NEXT: %2 = VEXTRACTF32x4Zrr %0, 1
+# ALL-NEXT: %3 = VEXTRACTF32x4Zrr %0, 2
+# ALL-NEXT: %4 = VEXTRACTF32x4Zrr %0, 3
+# ALL-NEXT: %xmm0 = COPY %1
+# ALL-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>), %2(<4 x s32>), %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
+ %xmm0 = COPY %1(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_unmerge_v256
+# ALL-LABEL: name: test_unmerge_v256
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %0 = IMPLICIT_DEF
+# ALL-NEXT: %1 = COPY %0.sub_ymm
+# ALL-NEXT: %2 = VEXTRACTF64x4Zrr %0, 1
+# ALL-NEXT: %xmm0 = COPY %1
+# ALL-NEXT: RET 0, implicit %ymm0
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>), %2(<8 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
+ %xmm0 = COPY %1(<8 x s32>)
+ RET 0, implicit %ymm0
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
new file mode 100644
index 000000000000..2743f882b2e4
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out
+; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err
+; This file checks that the fallback path to selection dag works.
+; The test is fragile in the sense that it must be updated to expose
+; something that fails with global-isel.
+; When we cannot produce a test case anymore, that means we can remove
+; the fallback path.
+
+; Check that we fall back on legalization failures.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s80) = G_FCONSTANT x86_fp80 0xK4002A000000000000000
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_x86_fp80_dump
+; FALLBACK-WITH-REPORT-OUT-LABEL: test_x86_fp80_dump:
+define void @test_x86_fp80_dump(x86_fp80* %ptr){
+ store x86_fp80 0xK4002A000000000000000, x86_fp80* %ptr, align 16
+ ret void
+}
+
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index e5f7cc5c6dd8..640b5215afe9 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2624,7 +2624,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -2941,7 +2942,8 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx-cmp.ll b/test/CodeGen/X86/avx-cmp.ll
index a050d6abe56f..963878b0f563 100644
--- a/test/CodeGen/X86/avx-cmp.ll
+++ b/test/CodeGen/X86/avx-cmp.ll
@@ -1,25 +1,59 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vcmpltps %ymm
-; CHECK-NOT: vucomiss
-define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind readnone {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind {
+; CHECK-LABEL: cmp00:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = fcmp olt <8 x float> %a, %b
%s = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %s
}
-; CHECK: vcmpltpd %ymm
-; CHECK-NOT: vucomisd
-define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind readnone {
+define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind {
+; CHECK-LABEL: cmp01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = fcmp olt <4 x double> %a, %b
%s = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %s
}
-declare void @scale() nounwind uwtable
-
-; CHECK: vucomisd
-define void @render() nounwind uwtable {
+declare void @scale() nounwind
+
+define void @render() nounwind {
+; CHECK-LABEL: render:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB2_6
+; CHECK-NEXT: # BB#1: # %for.cond5.preheader
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_5: # %if.then
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq scale
+; CHECK-NEXT: .LBB2_2: # %for.cond5
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: # BB#3: # %for.cond5
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: je .LBB2_2
+; CHECK-NEXT: # BB#4: # %for.body33
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0
+; CHECK-NEXT: jne .LBB2_5
+; CHECK-NEXT: jp .LBB2_5
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .LBB2_6: # %for.end52
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
entry:
br i1 undef, label %for.cond5, label %for.end52
@@ -42,89 +76,113 @@ for.end52:
ret void
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtd %xmm
-; CHECK-NEXT: vpcmpgtd %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <8 x i32> @int256-cmp(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+define <8 x i32> @int256_cmp(<8 x i32> %i, <8 x i32> %j) nounwind {
+; CHECK-LABEL: int256_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <8 x i32> %i, %j
%x = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtq %xmm
-; CHECK-NEXT: vpcmpgtq %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @v4i64-cmp(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmp(<4 x i64> %i, <4 x i64> %j) nounwind {
+; CHECK-LABEL: v4i64_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <4 x i64> %i, %j
%x = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtw %xmm
-; CHECK-NEXT: vpcmpgtw %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <16 x i16> @v16i16-cmp(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmp(<16 x i16> %i, <16 x i16> %j) nounwind {
+; CHECK-LABEL: v16i16_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <16 x i16> %i, %j
%x = sext <16 x i1> %bincmp to <16 x i16>
ret <16 x i16> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpgtb %xmm
-; CHECK-NEXT: vpcmpgtb %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <32 x i8> @v32i8-cmp(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmp(<32 x i8> %i, <32 x i8> %j) nounwind {
+; CHECK-LABEL: v32i8_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp slt <32 x i8> %i, %j
%x = sext <32 x i1> %bincmp to <32 x i8>
ret <32 x i8> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqd %xmm
-; CHECK-NEXT: vpcmpeqd %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <8 x i32> @int256-cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
+define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind {
+; CHECK-LABEL: int256_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <8 x i32> %i, %j
%x = sext <8 x i1> %bincmp to <8 x i32>
ret <8 x i32> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqq %xmm
-; CHECK-NEXT: vpcmpeqq %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @v4i64-cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind {
+; CHECK-LABEL: v4i64_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <4 x i64> %i, %j
%x = sext <4 x i1> %bincmp to <4 x i64>
ret <4 x i64> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqw %xmm
-; CHECK-NEXT: vpcmpeqw %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <16 x i16> @v16i16-cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
+define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind {
+; CHECK-LABEL: v16i16_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <16 x i16> %i, %j
%x = sext <16 x i1> %bincmp to <16 x i16>
ret <16 x i16> %x
}
-; CHECK: vextractf128 $1
-; CHECK: vextractf128 $1
-; CHECK-NEXT: vpcmpeqb %xmm
-; CHECK-NEXT: vpcmpeqb %xmm
-; CHECK-NEXT: vinsertf128 $1
-define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind {
+; CHECK-LABEL: v32i8_cmpeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bincmp = icmp eq <32 x i8> %i, %j
%x = sext <32 x i1> %bincmp to <32 x i8>
ret <32 x i8> %x
@@ -132,17 +190,28 @@ define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
;; Scalar comparison
-; CHECK: scalarcmpA
-; CHECK: vcmpeqsd
define i32 @scalarcmpA() uwtable ssp {
+; CHECK-LABEL: scalarcmpA:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: retq
%cmp29 = fcmp oeq double undef, 0.000000e+00
%res = zext i1 %cmp29 to i32
ret i32 %res
}
-; CHECK: scalarcmpB
-; CHECK: vcmpeqss
define i32 @scalarcmpB() uwtable ssp {
+; CHECK-LABEL: scalarcmpB:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
%cmp29 = fcmp oeq float undef, 0.000000e+00
%res = zext i1 %cmp29 to i32
ret i32 %res
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index d7eceb7cce66..06aadc476e4c 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -1,13 +1,62 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc -O0 < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -check-prefix=CHECK_O0
-
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-; CHECK: vmovaps
-define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s
+; RUN: llc -O0 < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s -check-prefix=CHECK_O0
+
+define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind {
+; CHECK-LABEL: test_256_load:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $96, %rsp
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vmovaps (%rbx), %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps (%r15), %ymm1
+; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovaps (%r14), %ymm2
+; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; CHECK-NEXT: callq dummy
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%rbx)
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%r15)
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vmovaps %ymm0, (%r14)
+; CHECK-NEXT: addq $96, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: test_256_load:
+; CHECK_O0: # BB#0: # %entry
+; CHECK_O0-NEXT: subq $152, %rsp
+; CHECK_O0-NEXT: vmovapd (%rdi), %ymm0
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm1
+; CHECK_O0-NEXT: vmovdqa (%rdx), %ymm2
+; CHECK_O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: callq dummy
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK_O0-NEXT: vmovapd %ymm0, (%rdx)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
+; CHECK_O0-NEXT: vmovaps %ymm1, (%rsi)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
+; CHECK_O0-NEXT: vmovdqa %ymm2, (%rdi)
+; CHECK_O0-NEXT: addq $152, %rsp
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
entry:
%0 = bitcast double* %d to <4 x double>*
%tmp1.i = load <4 x double>, <4 x double>* %0, align 32
@@ -27,62 +76,115 @@ declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
;; The two tests below check that we must fold load + scalar_to_vector
;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
-; CHECK: mov00
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
+; CHECK-LABEL: mov00:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov00:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; CHECK_O0-NEXT: retq
%val = load float, float* %ptr
-; CHECK: vmovss (%
%i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %i0
-; CHECK: ret
}
-; CHECK: mov01
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
+; CHECK-LABEL: mov01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3]
+; CHECK_O0-NEXT: retq
%val = load double, double* %ptr
-; CHECK: vmovsd (%
%i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
ret <4 x double> %i0
-; CHECK: ret
}
-; CHECK: vmovaps %ymm
define void @storev16i16(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 32
unreachable
}
-; CHECK: storev16i16_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev16i16_01(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 4
unreachable
}
-; CHECK: storev32i8
-; CHECK: vmovaps %ymm
define void @storev32i8(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 32
unreachable
}
-; CHECK: storev32i8_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev32i8_01(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 4
unreachable
}
; It is faster to make two saves, if the data is already in XMM registers. For
; example, after making an integer operation.
-; CHECK: _double_save
-; CHECK-NOT: vinsertf128 $1
-; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
-; CHECK: vmovaps %xmm
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
-entry:
+; CHECK-LABEL: double_save:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: double_save:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %Z, <8 x i32>* %P, align 16
ret void
@@ -90,60 +192,127 @@ entry:
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
-; CHECK_O0: _f_f
-; CHECK-O0: vmovss LCPI
-; CHECK-O0: vxorps %xmm
-; CHECK-O0: vmovss %xmm
define void @f_f() nounwind {
+; CHECK-LABEL: f_f:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_2
+; CHECK-NEXT: # BB#1: # %cif_mask_all
+; CHECK-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_4
+; CHECK-NEXT: # BB#3: # %cif_mixed_test_all
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
+; CHECK-NEXT: .LBB8_4: # %cif_mixed_test_any_check
+;
+; CHECK_O0-LABEL: f_f:
+; CHECK_O0: # BB#0: # %allocas
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_1
+; CHECK_O0-NEXT: jmp .LBB8_2
+; CHECK_O0-NEXT: .LBB8_1: # %cif_mask_all
+; CHECK_O0-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_3
+; CHECK_O0-NEXT: jmp .LBB8_4
+; CHECK_O0-NEXT: .LBB8_3: # %cif_mixed_test_all
+; CHECK_O0-NEXT: movl $-1, %eax
+; CHECK_O0-NEXT: vmovd %eax, %xmm0
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: # implicit-def: %RCX
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rcx)
+; CHECK_O0-NEXT: .LBB8_4: # %cif_mixed_test_any_check
allocas:
br i1 undef, label %cif_mask_all, label %cif_mask_mixed
-cif_mask_all: ; preds = %allocas
+cif_mask_all:
unreachable
-cif_mask_mixed: ; preds = %allocas
+cif_mask_mixed:
br i1 undef, label %cif_mixed_test_all, label %cif_mixed_test_any_check
-cif_mixed_test_all: ; preds = %cif_mask_mixed
+cif_mixed_test_all:
call void @llvm.x86.avx.maskstore.ps.256(i8* undef, <8 x i32> <i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x float> undef) nounwind
unreachable
-cif_mixed_test_any_check: ; preds = %cif_mask_mixed
+cif_mixed_test_any_check:
unreachable
}
-; CHECK: add8i32
-; CHECK: vmovups
-; CHECK: vmovups
-; CHECK-NOT: vinsertf128
-; CHECK-NOT: vextractf128
-; CHECK: vmovups
-; CHECK: vmovups
define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
+; CHECK-LABEL: add8i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups 16(%rsi), %xmm1
+; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add8i32:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqu (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <8 x i32>, <8 x i32>* %bp, align 1
%x = add <8 x i32> zeroinitializer, %b
store <8 x i32> %x, <8 x i32>* %ret, align 1
ret void
}
-; CHECK: add4i64a64
-; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
-; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a64:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <4 x i64>, <4 x i64>* %bp, align 64
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 64
ret void
}
-; CHECK: add4i64a16
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-NEXT: vmovaps 16(%rsi), %xmm1
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
%b = load <4 x i64>, <4 x i64>* %bp, align 16
%x = add <4 x i64> zeroinitializer, %b
store <4 x i64> %x, <4 x i64>* %ret, align 16
ret void
}
+
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 47e95fe31bdf..a12a412fb94d 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -10,8 +10,8 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
@@ -21,14 +21,14 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -40,8 +40,8 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
@@ -51,14 +51,14 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -70,8 +70,8 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
@@ -81,14 +81,14 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -101,8 +101,8 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
@@ -112,14 +112,14 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -131,10 +131,10 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
@@ -147,14 +147,14 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -172,10 +172,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
@@ -188,14 +188,14 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; BTVER2: # BB#0:
; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -213,10 +213,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_andpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
@@ -229,14 +229,14 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; BTVER2: # BB#0:
; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -252,10 +252,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_andps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
@@ -268,14 +268,14 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; BTVER2: # BB#0:
; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -291,10 +291,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
@@ -306,14 +306,14 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -326,9 +326,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
@@ -356,9 +356,9 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
@@ -387,9 +387,9 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
@@ -418,8 +418,8 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
; SANDY-LABEL: test_broadcastf128:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastf128:
; HASWELL: # BB#0:
@@ -443,8 +443,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
define <4 x double> @test_broadcastsd_ymm(double *%a0) {
; SANDY-LABEL: test_broadcastsd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastsd_ymm:
; HASWELL: # BB#0:
@@ -469,8 +469,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
define <4 x float> @test_broadcastss(float *%a0) {
; SANDY-LABEL: test_broadcastss:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss:
; HASWELL: # BB#0:
@@ -496,7 +496,7 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
; SANDY-LABEL: test_broadcastss_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss_ymm:
; HASWELL: # BB#0:
@@ -522,9 +522,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
@@ -560,9 +560,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
@@ -598,9 +598,9 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
@@ -613,14 +613,14 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2pd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = sitofp <4 x i32> %a0 to <4 x double>
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
@@ -632,12 +632,12 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
@@ -650,14 +650,14 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2ps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = sitofp <8 x i32> %a0 to <8 x float>
%2 = load <8 x i32>, <8 x i32> *%a1, align 16
@@ -669,10 +669,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
@@ -704,10 +704,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
@@ -741,8 +741,8 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
@@ -774,9 +774,9 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:3.00]
+; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
@@ -786,14 +786,14 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fdiv <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -804,9 +804,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
@@ -816,14 +816,14 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00]
-; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00]
+; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
+; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fdiv <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -834,9 +834,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
@@ -866,9 +866,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
; SANDY-LABEL: test_extractf128:
; SANDY: # BB#0:
; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_extractf128:
; HASWELL: # BB#0:
@@ -900,7 +900,7 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY: # BB#0:
; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
@@ -929,9 +929,9 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
@@ -960,9 +960,9 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
@@ -991,9 +991,9 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
@@ -1023,9 +1023,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; SANDY-LABEL: test_insertf128:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertf128:
; HASWELL: # BB#0:
@@ -1038,14 +1038,14 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; BTVER2: # BB#0:
; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertf128:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -1059,8 +1059,8 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
define <32 x i8> @test_lddqu(i8* %a0) {
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
@@ -1084,10 +1084,10 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
; SANDY-LABEL: test_maskmovpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd:
; HASWELL: # BB#0:
@@ -1119,10 +1119,10 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
; SANDY-LABEL: test_maskmovpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd_ymm:
; HASWELL: # BB#0:
@@ -1154,10 +1154,10 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
; SANDY-LABEL: test_maskmovps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps:
; HASWELL: # BB#0:
@@ -1189,10 +1189,10 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
; SANDY-LABEL: test_maskmovps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps_ymm:
; HASWELL: # BB#0:
@@ -1225,8 +1225,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
@@ -1256,8 +1256,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
@@ -1288,7 +1288,7 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
@@ -1319,7 +1319,7 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY: # BB#0:
; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
@@ -1348,10 +1348,10 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
@@ -1363,14 +1363,14 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movapd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <4 x double>, <4 x double> *%a0, align 32
@@ -1382,10 +1382,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
@@ -1397,14 +1397,14 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movaps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <8 x float>, <8 x float> *%a0, align 32
@@ -1417,9 +1417,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
@@ -1432,14 +1432,14 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movddup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -1451,9 +1451,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
define i32 @test_movmskpd(<4 x double> %a0) {
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
@@ -1479,9 +1479,9 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_movmskps(<8 x float> %a0) {
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
@@ -1508,8 +1508,8 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
@@ -1519,13 +1519,13 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <4 x double> %a0, %a0
@@ -1537,8 +1537,8 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
@@ -1548,13 +1548,13 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fadd <8 x float> %a0, %a0
@@ -1566,9 +1566,9 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
@@ -1581,14 +1581,14 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movshdup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1601,9 +1601,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
@@ -1616,14 +1616,14 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsldup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1635,12 +1635,12 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
@@ -1652,14 +1652,14 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movupd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <4 x double>, <4 x double> *%a0, align 1
@@ -1671,12 +1671,12 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
@@ -1688,14 +1688,14 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movups:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = load <8 x float>, <8 x float> *%a0, align 1
@@ -1708,8 +1708,8 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
@@ -1719,14 +1719,14 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
+; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
+; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fmul <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -1738,8 +1738,8 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
@@ -1749,14 +1749,14 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fmul <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -1767,10 +1767,10 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: orpd:
; HASWELL: # BB#0:
@@ -1783,14 +1783,14 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
; BTVER2: # BB#0:
; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: orpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -1806,10 +1806,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
@@ -1822,14 +1822,14 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
; BTVER2: # BB#0:
; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -1846,9 +1846,9 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_permilpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd:
; HASWELL: # BB#0:
@@ -1880,10 +1880,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_permilpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [8:1.00]
; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd_ymm:
; HASWELL: # BB#0:
@@ -1896,14 +1896,14 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilpd_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -1916,9 +1916,9 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_permilps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps:
; HASWELL: # BB#0:
@@ -1950,10 +1950,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_permilps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [8:1.00]
; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps_ymm:
; HASWELL: # BB#0:
@@ -1966,14 +1966,14 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilps_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1986,8 +1986,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
; SANDY-LABEL: test_permilvarpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd:
; HASWELL: # BB#0:
@@ -2018,7 +2018,7 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd_ymm:
; HASWELL: # BB#0:
@@ -2048,8 +2048,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
; SANDY-LABEL: test_permilvarps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps:
; HASWELL: # BB#0:
@@ -2080,7 +2080,7 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps_ymm:
; HASWELL: # BB#0:
@@ -2112,7 +2112,7 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
@@ -2123,16 +2123,16 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2148,7 +2148,7 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
@@ -2161,14 +2161,14 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2184,7 +2184,7 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
@@ -2197,14 +2197,14 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2217,10 +2217,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:3.00]
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
@@ -2231,16 +2231,16 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2254,9 +2254,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
@@ -2269,14 +2269,14 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; BTVER2: # BB#0:
; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2289,8 +2289,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
@@ -2318,10 +2318,10 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:3.00]
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:3.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
@@ -2332,16 +2332,16 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2354,10 +2354,10 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:3.00]
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
@@ -2368,16 +2368,16 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2391,8 +2391,8 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
@@ -2402,14 +2402,14 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2421,8 +2421,8 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
@@ -2432,14 +2432,14 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -2451,11 +2451,11 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_testpd:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd:
; HASWELL: # BB#0:
@@ -2495,12 +2495,12 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
; SANDY-LABEL: test_testpd_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd_ymm:
; HASWELL: # BB#0:
@@ -2542,11 +2542,11 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_testps:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps:
; HASWELL: # BB#0:
@@ -2586,12 +2586,12 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
; SANDY-LABEL: test_testps_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps_ymm:
; HASWELL: # BB#0:
@@ -2635,7 +2635,7 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
@@ -2648,14 +2648,14 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2669,7 +2669,7 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
@@ -2698,9 +2698,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
@@ -2713,14 +2713,14 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2733,8 +2733,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
@@ -2762,10 +2762,10 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
@@ -2778,14 +2778,14 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -2801,10 +2801,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
@@ -2817,14 +2817,14 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -2841,7 +2841,7 @@ define void @test_zeroall() {
; SANDY-LABEL: test_zeroall:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroall:
; HASWELL: # BB#0:
@@ -2866,7 +2866,7 @@ define void @test_zeroupper() {
; SANDY-LABEL: test_zeroupper:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroupper:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/avx-unpack.ll b/test/CodeGen/X86/avx-unpack.ll
index 6924d98b38b1..7826bc97eec5 100644
--- a/test/CodeGen/X86/avx-unpack.ll
+++ b/test/CodeGen/X86/avx-unpack.ll
@@ -1,57 +1,84 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK: vunpckhps
define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpcklps %ymm
-define <8 x float> @unpacklops-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpacklops_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklops_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpcklpd %ymm
-define <4 x double> @unpacklopd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpacklopd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklopd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpckhps %ymm
-define <8 x float> @unpackhips-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpackhips_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhips_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,3,u,4,u,5]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,u,3,u,4,u,5,u]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpckhpd %ymm
-define <4 x double> @unpackhipd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpackhipd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhipd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
@@ -60,102 +87,135 @@ entry:
;;;; Unpack versions using the fp unit for int unpacking
;;;;
-; CHECK: vunpckhps
define <8 x i32> @unpackhips1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhps (%
define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpckhpd (%
define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklps (%
define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklpd (%
define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vpunpckhwd
-; CHECK: vpunpckhwd
-; CHECK: vinsertf128
define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpcklwd
-; CHECK: vpunpcklwd
-; CHECK: vinsertf128
define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpckhbw
-; CHECK: vpunpckhbw
-; CHECK: vinsertf128
define <32 x i8> @unpackhbw_undef(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle.i
}
-; CHECK: vpunpcklbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
define <32 x i8> @unpacklbw_undef(<32 x i8> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle.i
}
+
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index 38389de7a8a1..b7a4d5b5c308 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,30 +1,37 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK-LABEL: A:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: A:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-; CHECK-LABEL: B:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: B:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 1>
ret <4 x double> %shuffle
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-; Just check that no crash happens
-; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
+; CHECK-LABEL: insert_crash:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+; CHECK-NEXT: retq
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%ret_0a.i.i.i452 = shufflevector <4 x double> %v1.i.i451, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -40,72 +47,87 @@ allocas:
;; DAG Combine must remove useless vinsertf128 instructions
-; CHECK-LABEL: DAGCombineA:
-; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
- %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %2
+; CHECK-LABEL: DAGCombineA:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %t2 = shufflevector <8 x i32> %t1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %t2
}
-; CHECK-LABEL: DAGCombineB:
-; CHECK: vpaddd %xmm
-; CHECK-NOT: vinsertf128 $1
-; CHECK: vpaddd %xmm
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
- %1 = add <8 x i32> %v1, %v2
- %2 = add <8 x i32> %1, %v1
- ret <8 x i32> %2
+; CHECK-LABEL: DAGCombineB:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %t1 = add <8 x i32> %v1, %v2
+ %t2 = add <8 x i32> %t1, %v1
+ ret <8 x i32> %t2
}
-; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_si:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
; rdar://10643481
-; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovaps
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_combine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 16
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
- ret <8 x float> %2
+ %t0 = bitcast float* %add.ptr to <4 x float>*
+ %t1 = load <4 x float>, <4 x float>* %t0, align 16
+ %t2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %t1, i8 1)
+ ret <8 x float> %t2
}
; rdar://11076953
-; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovups
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_ucombine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 8
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
- ret <8 x float> %2
+ %t0 = bitcast float* %add.ptr to <4 x float>*
+ %t1 = load <4 x float>, <4 x float>* %t0, align 8
+ %t2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %t1, i8 1)
+ ret <8 x float> %t2
}
+
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 971d03af3778..318c9cfd8a3f 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -633,13 +633,13 @@ entry:
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V111:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vpbroadcastd LCPI29_0, %ymm1
+; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V111:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -660,13 +660,13 @@ entry:
define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V113:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vbroadcastss LCPI30_0, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V113:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -687,12 +687,12 @@ entry:
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32: ## BB#0:
-; X32-NEXT: vbroadcastss LCPI31_0, %xmm0
+; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## BB#0:
-; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-NEXT: retq
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index eae7b94f5135..b5a13404a230 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -14,6 +14,7 @@ define double @test1(double %a, double %b) nounwind {
; ALL-NEXT: LBB0_2: ## %l2
; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -36,6 +37,7 @@ define float @test2(float %a, float %b) nounwind {
; ALL-NEXT: LBB1_2: ## %l2
; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 29a5325a0ae9..f858e7eb792f 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -12,6 +12,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test1:
; SKX: ## BB#0:
@@ -21,6 +22,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SKX-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -36,6 +38,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
@@ -45,6 +48,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -58,6 +62,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test3:
; SKX: ## BB#0:
@@ -65,6 +70,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -78,6 +84,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
@@ -86,6 +93,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@@ -96,11 +104,13 @@ define i32 @test5(<4 x float> %x) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, %eax
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test5:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, %eax
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
@@ -111,11 +121,13 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test6:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
@@ -135,6 +147,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test7:
; SKX: ## BB#0:
@@ -150,6 +163,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
@@ -168,6 +182,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
@@ -183,6 +198,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
@@ -201,6 +217,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
@@ -216,6 +233,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
@@ -234,6 +252,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
@@ -249,6 +268,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
@@ -1293,7 +1313,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -1457,7 +1477,7 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL: ## BB#0:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -2326,7 +2346,7 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 2b04b9229b3d..b3fbceea80a9 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -8,6 +8,7 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -19,6 +20,7 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -30,6 +32,7 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -42,6 +45,7 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -53,6 +57,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -64,6 +69,7 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -117,12 +123,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -137,12 +145,14 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
ret <8 x float> %max
@@ -154,6 +164,7 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -168,6 +179,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
; SKX: ## BB#0:
@@ -178,6 +190,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -330,6 +343,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
@@ -339,6 +353,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -642,6 +657,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
@@ -651,6 +667,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -704,6 +721,7 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -715,6 +733,7 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -727,6 +746,7 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -739,6 +759,7 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -752,6 +773,7 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -766,6 +788,7 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -780,6 +803,7 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -795,6 +819,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -809,6 +834,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -823,6 +849,7 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -838,6 +865,7 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -855,6 +883,7 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -920,12 +949,14 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
ret <4 x double> %max
@@ -938,12 +969,14 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
%max = select <2 x i1> %mask, <2 x double> %x, <2 x double> %x1
@@ -957,12 +990,14 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %x1
@@ -976,6 +1011,7 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -988,12 +1024,14 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1010,12 +1048,14 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %x1
@@ -1029,6 +1069,7 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1041,6 +1082,7 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1058,12 +1100,14 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
@@ -1081,12 +1125,14 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1104,6 +1150,7 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
@@ -1124,12 +1171,14 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
@@ -1147,12 +1196,14 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
@@ -1172,6 +1223,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test43:
; SKX: ## BB#0:
@@ -1180,6 +1232,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index e0acf2be653e..43b1f53a09fa 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -1,56 +1,98 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: test256_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_1:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_1:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
ret <4 x i64> %max
}
define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
-; CHECK-LABEL: test256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_2:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_2:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
-; CHECK-LABEL: test256_3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_3:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k1
+; VLX-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_3:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; NoVLX-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
ret <8 x i32> %max
}
define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
-; CHECK-LABEL: test256_4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_4:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_4:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_5:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_5:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -58,11 +100,21 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin
}
define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_5b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_5b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_5b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqd %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -70,11 +122,21 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_6:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_6:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -82,11 +144,21 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_6b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_6b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_6b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -94,11 +166,21 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_7:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_7:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -106,11 +188,21 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_7b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_7b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_7b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -118,11 +210,21 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_8:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_8:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -130,11 +232,21 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
}
define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test256_8b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_8b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_8b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -142,12 +254,25 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
}
define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_9:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_9:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_9:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -156,12 +281,22 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
}
define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_10:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_10:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %ymm1, %ymm0, %k1
+; VLX-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_10:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpandn %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -170,12 +305,20 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
}
define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_11:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_11:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_11:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3
+; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <4 x i64> %x, %y
@@ -185,12 +328,25 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
}
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_12:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_12:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask0 = icmp ule <8 x i32> %x, %y
@@ -200,11 +356,18 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
}
define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
-; CHECK-LABEL: test256_13:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_13:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_13:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
%y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -214,11 +377,21 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
}
define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
-; CHECK-LABEL: test256_14:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_14:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_14:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
%y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -228,12 +401,25 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
}
define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
-; CHECK-LABEL: test256_15:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_15:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_15:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -245,12 +431,21 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
}
define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
-; CHECK-LABEL: test256_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_16:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %ymm1, %ymm2, %k1
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_16:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm3
+; NoVLX-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -262,11 +457,21 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
}
define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_17:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_17:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_17:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpneqd %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -274,11 +479,21 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_18:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_18:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_18:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpneqd %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -286,11 +501,21 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_19:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_19:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_19:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm2, %zmm0, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -298,11 +523,21 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
-; CHECK-LABEL: test256_20:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test256_20:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test256_20:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
+; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
@@ -310,55 +545,90 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
}
define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
-; CHECK-LABEL: test128_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_1:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_1:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
ret <2 x i64> %max
}
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
-; CHECK-LABEL: test128_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_2:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_2:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
-; CHECK-LABEL: test128_3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_3:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k1
+; VLX-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_3:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
ret <4 x i32> %max
}
define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
-; CHECK-LABEL: test128_4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_4:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_4:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
-; CHECK-LABEL: test128_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_5:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_5:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -366,11 +636,17 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin
}
define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
-; CHECK-LABEL: test128_5b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_5b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_5b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -378,11 +654,17 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi
}
define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_6:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_6:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -390,11 +672,17 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_6b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_6b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_6b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp slt <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -402,11 +690,19 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_7:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_7:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -414,11 +710,19 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_7b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_7b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_7b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -426,11 +730,18 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_8:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_8:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -438,11 +749,19 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
}
define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_8b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_8b:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_8b:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
+; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -450,12 +769,20 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_9:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_9:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; VLX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_9:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -464,12 +791,22 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
}
define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_10:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_10:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %xmm1, %xmm0, %k1
+; VLX-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_10:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; NoVLX-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
%mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
@@ -478,12 +815,20 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
}
define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_11:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_11:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_11:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <2 x i64> %x, %y
@@ -493,12 +838,21 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
}
define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_12:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_12:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask0 = icmp ule <4 x i32> %x, %y
@@ -508,11 +862,18 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
}
define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
-; CHECK-LABEL: test128_13:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_13:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_13:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
%y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
@@ -522,11 +883,20 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
}
define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
-; CHECK-LABEL: test128_14:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_14:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_14:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
%y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -536,12 +906,21 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
}
define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
-; CHECK-LABEL: test128_15:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_15:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_15:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm3
+; NoVLX-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -553,12 +932,21 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
}
define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
-; CHECK-LABEL: test128_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_16:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleq %xmm1, %xmm2, %k1
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
+; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_16:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
@@ -570,11 +958,19 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64
}
define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_17:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_17:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_17:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -582,11 +978,19 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_18:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_18:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_18:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -594,11 +998,18 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_19:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_19:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_19:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vpmaxud (%rdi), %xmm0, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
@@ -606,11 +1017,19 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
}
define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
-; CHECK-LABEL: test128_20:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; VLX-LABEL: test128_20:
+; VLX: # BB#0:
+; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test128_20:
+; NoVLX: # BB#0:
+; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
+; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
+; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index f297fc3db95f..4d3a1495617e 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -1,13 +1,124 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi0:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi2:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi3:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi4:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi5:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi6:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi7:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -18,11 +129,122 @@ entry:
}
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi8:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi9:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi10:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi11:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi12:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi13:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi14:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi15:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -34,12 +256,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi16:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi17:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi18:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi19:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi20:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi21:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi22:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi23:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -52,12 +386,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi24:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi25:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi26:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi27:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi28:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi29:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi30:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi31:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -72,11 +518,127 @@ entry:
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi32:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi33:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi34:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi35:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi36:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi37:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi38:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi39:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -87,11 +649,127 @@ entry:
}
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi40:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi41:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi42:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi43:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi44:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi45:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi46:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi47:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -103,12 +781,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi48:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi49:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi50:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi51:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi52:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi53:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi54:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi55:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -121,12 +916,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi56:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi57:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi58:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi59:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi60:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi61:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi62:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi63:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -141,12 +1053,46 @@ entry:
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi64:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi65:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi66:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -157,12 +1103,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi67:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi68:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi69:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -174,13 +1154,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi70:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi71:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi72:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -193,13 +1216,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi73:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi74:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi75:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -214,11 +1280,24 @@ entry:
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -229,11 +1308,24 @@ entry:
}
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -245,12 +1337,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -263,12 +1369,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -283,11 +1403,72 @@ entry:
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi76:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi77:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi78:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -298,11 +1479,72 @@ entry:
}
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi79:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi80:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi81:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -314,12 +1556,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi82:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi83:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi84:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -332,12 +1636,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi85:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi86:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi87:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -352,11 +1718,77 @@ entry:
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi88:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi89:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi90:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -367,11 +1799,77 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi91:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi92:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi93:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -383,12 +1881,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi94:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi95:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi96:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -401,12 +1966,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi97:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi98:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi99:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -421,12 +2053,123 @@ entry:
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi100:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi101:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi102:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi103:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi104:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi105:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi106:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi107:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -437,12 +2180,123 @@ entry:
}
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi108:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi109:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi110:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi111:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi112:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi113:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi114:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi115:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -454,13 +2308,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi116:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi117:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi118:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi119:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi120:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi121:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi122:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi123:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -473,13 +2439,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi124:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi125:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi126:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi127:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi128:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi129:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi130:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi131:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -494,12 +2572,128 @@ entry:
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi132:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi133:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi134:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi135:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi136:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi137:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi138:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi139:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -510,12 +2704,128 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi140:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi141:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi142:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi143:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi144:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi145:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi146:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi147:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -527,13 +2837,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi148:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi149:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi150:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi151:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi152:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi153:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi154:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi155:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -546,13 +2973,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi156:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi157:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi158:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi159:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi160:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi161:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi162:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi163:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -567,12 +3111,348 @@ entry:
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi164:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi165:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi166:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -583,12 +3463,263 @@ entry:
}
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi167:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi168:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi169:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -600,13 +3731,358 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi170:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi171:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi172:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -619,13 +4095,273 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi173:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi174:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi175:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -640,11 +4376,51 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -655,11 +4431,51 @@ entry:
}
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -671,12 +4487,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -690,12 +4564,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -711,11 +4643,52 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -728,12 +4701,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -750,11 +4782,50 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -765,11 +4836,50 @@ entry:
}
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -781,12 +4891,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -800,12 +4967,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -821,11 +5045,51 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -838,12 +5102,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -860,11 +5182,39 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi176:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi177:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi178:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -875,11 +5225,39 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi179:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi180:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi181:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -891,12 +5269,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi182:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi183:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi184:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -910,12 +5334,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi185:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi186:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi187:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -931,11 +5401,40 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi188:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi189:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi190:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -948,12 +5447,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi191:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi192:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi193:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -970,11 +5516,46 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi194:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi195:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi196:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -985,11 +5566,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi197:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi198:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi199:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1001,12 +5617,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi200:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi201:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi202:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -1020,12 +5689,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi203:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi204:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi205:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1041,11 +5763,47 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi206:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi207:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi208:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -1058,12 +5816,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi209:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi210:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi211:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -1080,21 +5892,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1106,21 +5920,23 @@ entry:
}
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1133,23 +5949,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1163,23 +5981,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1195,21 +6015,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1223,23 +6045,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1256,12 +6080,72 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi212:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi213:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi214:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1272,12 +6156,72 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi215:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi216:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi217:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1289,13 +6233,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi218:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi219:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi220:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1308,13 +6314,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi221:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi222:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi223:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1329,12 +6397,72 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1347,13 +6475,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi227:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi228:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi229:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1369,12 +6559,77 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi230:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi231:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi232:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1385,12 +6640,77 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi233:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi234:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi235:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1402,13 +6722,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi236:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi237:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi238:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1421,13 +6808,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi239:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi240:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi241:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -1442,12 +6896,77 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi242:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi243:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi244:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1460,13 +6979,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi245:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi246:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi247:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -1482,12 +7068,120 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi251:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi252:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi253:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi254:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi255:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1498,12 +7192,120 @@ entry:
}
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi256:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi257:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi258:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi259:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi260:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi261:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi262:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi263:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1515,13 +7317,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi264:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi265:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi266:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi267:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi268:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi269:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi270:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi271:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1534,13 +7445,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi272:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi273:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi274:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi275:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi276:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi277:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi278:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi279:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1555,12 +7575,120 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi280:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi281:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi282:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi283:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi284:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi285:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi286:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi287:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1573,13 +7701,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi288:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi289:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi290:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi291:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi292:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi293:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi294:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi295:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1595,12 +7832,125 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi296:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi297:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi298:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi299:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi300:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi301:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi302:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi303:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1611,12 +7961,125 @@ entry:
}
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi304:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi305:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi306:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi307:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi308:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi309:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi310:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi311:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1628,13 +8091,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi312:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi313:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi314:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi315:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi316:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi317:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi318:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi319:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1647,13 +8224,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi320:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi321:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi322:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi323:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi324:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi325:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi326:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi327:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -1668,12 +8359,125 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi328:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi329:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi330:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi331:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi332:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi333:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi334:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi335:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1686,13 +8490,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi336:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi337:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi338:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi339:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi340:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi341:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi342:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi343:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -1708,12 +8626,23 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1724,12 +8653,23 @@ entry:
}
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1741,13 +8681,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1761,13 +8722,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1783,12 +8765,24 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1801,13 +8795,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1824,11 +8840,35 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1839,11 +8879,35 @@ entry:
}
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1855,12 +8919,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1874,12 +8972,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1895,11 +9027,36 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1912,12 +9069,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -1934,11 +9126,34 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1949,11 +9164,34 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -1965,12 +9203,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1984,12 +9255,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2005,11 +9309,35 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2022,12 +9350,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2044,11 +9406,39 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi344:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi345:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi346:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2059,11 +9449,39 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi347:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi348:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi349:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2075,12 +9493,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi350:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi351:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi352:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2094,12 +9550,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi353:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi354:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi355:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2115,11 +9609,40 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi356:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi357:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi358:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2132,12 +9655,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi359:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi360:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi361:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2154,11 +9716,46 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi362:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi363:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi364:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2169,11 +9766,46 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi365:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi366:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi367:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2185,12 +9817,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi368:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi369:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi370:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2204,12 +9881,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi371:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi372:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi373:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -2225,11 +9947,47 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi374:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi375:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi376:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2242,12 +10000,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi377:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi378:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi379:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -2264,12 +10068,53 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2280,12 +10125,53 @@ entry:
}
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2297,13 +10183,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2317,13 +10262,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2339,12 +10343,54 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2357,13 +10403,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2380,12 +10486,52 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2396,12 +10542,52 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2413,13 +10599,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2433,13 +10677,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2455,12 +10757,53 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2473,13 +10816,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2496,12 +10898,41 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi380:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi381:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi382:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2512,12 +10943,41 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi383:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi384:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi385:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2529,13 +10989,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi386:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi387:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi388:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2549,13 +11056,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi389:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi390:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi391:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2571,12 +11125,42 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi392:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi393:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi394:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2589,13 +11173,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi395:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi396:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi397:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2612,12 +11244,48 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi398:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi399:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi400:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2628,12 +11296,48 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi401:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi402:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi403:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2645,13 +11349,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi404:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi405:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi406:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2665,13 +11423,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi407:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi408:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi409:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -2687,12 +11499,49 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi410:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi411:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi412:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2705,13 +11554,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi413:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi414:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi415:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -2728,12 +11632,20 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2744,12 +11656,20 @@ entry:
}
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2761,13 +11681,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2780,13 +11709,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2801,12 +11739,20 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2819,13 +11765,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2841,12 +11796,70 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi416:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi417:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi418:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2857,12 +11870,70 @@ entry:
}
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi419:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi420:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi421:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2874,13 +11945,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi422:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi423:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi424:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2893,13 +12023,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi425:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi426:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi427:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2914,12 +12103,70 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi428:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi429:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi430:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2932,13 +12179,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi431:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi432:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi433:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -2954,12 +12260,75 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi434:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi435:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi436:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2970,12 +12339,75 @@ entry:
}
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi437:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi438:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi439:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -2987,13 +12419,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi440:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi441:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi442:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -3006,13 +12502,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi443:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi444:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi445:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3027,12 +12587,75 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi446:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi447:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi448:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -3045,13 +12668,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi449:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi450:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi451:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -3067,11 +12754,122 @@ entry:
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi452:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi453:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi454:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi455:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi456:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi457:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi458:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi459:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3082,11 +12880,122 @@ entry:
}
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi460:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi461:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi462:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi463:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi464:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi465:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi466:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi467:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3098,12 +13007,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi468:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi469:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi470:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi471:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi472:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi473:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi474:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi475:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3116,12 +13137,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi476:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi477:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi478:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi479:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi480:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi481:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi482:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi483:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3136,11 +13269,127 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi484:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi485:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi486:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi487:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi488:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi489:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi490:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi491:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3151,11 +13400,127 @@ entry:
}
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi492:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi493:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi494:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi495:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi496:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi497:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi498:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi499:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3167,12 +13532,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi500:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi501:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi502:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi503:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi504:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi505:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi506:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi507:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3185,12 +13667,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi508:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi509:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi510:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi511:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi512:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi513:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi514:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi515:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3205,12 +13804,46 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi516:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi517:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi518:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3221,12 +13854,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi519:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi520:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi521:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3238,13 +13905,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi522:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi523:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi524:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3257,13 +13967,56 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi525:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi526:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi527:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3278,11 +14031,24 @@ entry:
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3293,11 +14059,24 @@ entry:
}
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3309,12 +14088,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3327,12 +14120,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3347,11 +14154,72 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi528:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi529:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi530:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3362,11 +14230,72 @@ entry:
}
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi531:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi532:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi533:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3378,12 +14307,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi534:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi535:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi536:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3396,12 +14387,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi537:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi538:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi539:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3416,11 +14469,77 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi540:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi541:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi542:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3431,11 +14550,77 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi543:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi544:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi545:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3447,12 +14632,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi546:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi547:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi548:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3465,12 +14717,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi549:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi550:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi551:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3485,12 +14804,123 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi552:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi553:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi554:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi555:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi556:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi557:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi558:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi559:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3501,12 +14931,123 @@ entry:
}
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi560:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi561:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi562:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi563:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi564:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi565:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi566:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi567:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3518,13 +15059,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi568:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi569:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi570:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi571:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi572:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi573:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi574:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi575:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3537,13 +15190,125 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi576:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi577:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi578:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi579:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi580:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi581:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi582:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi583:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3558,12 +15323,128 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi584:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi585:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi586:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi587:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi588:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi589:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi590:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi591:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3574,12 +15455,128 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi592:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi593:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi594:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi595:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi596:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi597:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi598:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi599:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3591,13 +15588,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi600:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi601:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi602:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi603:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi604:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi605:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi606:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi607:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3610,13 +15724,130 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi608:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi609:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi610:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi611:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi612:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi613:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi614:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi615:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -3631,12 +15862,348 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi616:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi617:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi618:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3647,12 +16214,263 @@ entry:
}
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi619:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi620:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi621:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3664,13 +16482,358 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi622:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi623:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi624:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3683,13 +16846,273 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi625:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi626:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi627:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -3704,11 +17127,51 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3719,11 +17182,51 @@ entry:
}
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3735,12 +17238,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3754,12 +17315,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3775,11 +17394,52 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3792,12 +17452,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3814,11 +17533,50 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3829,11 +17587,50 @@ entry:
}
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3845,12 +17642,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3864,12 +17718,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3885,11 +17796,51 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3902,12 +17853,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -3924,11 +17933,39 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi628:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi629:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi630:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3939,11 +17976,39 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi631:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi632:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi633:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3955,12 +18020,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi634:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi635:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi636:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3974,12 +18085,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi637:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi638:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi639:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -3995,11 +18152,40 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi640:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi641:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi642:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4012,12 +18198,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi643:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi644:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi645:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4034,11 +18267,46 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi646:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi647:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi648:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4049,11 +18317,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi649:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi650:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi651:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4065,12 +18368,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi652:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi653:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi654:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4084,12 +18440,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi655:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi656:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi657:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4105,11 +18514,47 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi658:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi659:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi660:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4122,12 +18567,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi661:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi662:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi663:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -4144,21 +18643,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4170,21 +18671,23 @@ entry:
}
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4197,23 +18700,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4227,23 +18732,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4259,21 +18766,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4287,23 +18796,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4320,12 +18831,72 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi664:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi665:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi666:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4336,12 +18907,72 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi667:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi668:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi669:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4353,13 +18984,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi670:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi671:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi672:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4372,13 +19065,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi673:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi674:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi675:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4393,12 +19148,72 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi676:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi677:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi678:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4411,13 +19226,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi679:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi680:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi681:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4433,12 +19310,77 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi682:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi683:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi684:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4449,12 +19391,77 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi685:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi686:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi687:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4466,13 +19473,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi688:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi689:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi690:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4485,13 +19559,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi691:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi692:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi693:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -4506,12 +19647,77 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi694:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi695:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi696:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4524,13 +19730,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi697:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi698:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi699:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -4546,12 +19819,120 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi700:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi701:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi702:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi703:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi704:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi705:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi706:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi707:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4562,12 +19943,120 @@ entry:
}
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi708:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi709:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi710:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi711:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi712:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi713:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi714:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi715:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4579,13 +20068,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi716:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi717:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi718:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi719:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi720:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi721:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi722:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi723:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4598,13 +20196,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi724:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi725:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi726:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi727:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi728:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi729:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi730:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi731:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4619,12 +20326,120 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi732:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi733:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi734:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi735:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi736:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi737:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi738:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi739:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4637,13 +20452,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi740:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi741:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi742:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi743:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi744:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi745:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi746:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi747:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4659,12 +20583,125 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi748:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi749:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi750:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi751:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi752:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi753:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi754:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi755:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4675,12 +20712,125 @@ entry:
}
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi756:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi757:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi758:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi759:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi760:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi761:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi762:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi763:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4692,13 +20842,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi764:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi765:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi766:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi767:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi768:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi769:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi770:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi771:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4711,13 +20975,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi772:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi773:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi774:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi775:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi776:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi777:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi778:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi779:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -4732,12 +21110,125 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi780:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi781:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi782:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi783:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi784:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi785:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi786:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi787:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4750,13 +21241,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi788:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi789:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi790:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi791:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi792:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi793:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi794:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi795:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -4772,12 +21377,23 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4788,12 +21404,23 @@ entry:
}
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4805,13 +21432,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4825,13 +21473,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4847,12 +21516,24 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4865,13 +21546,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4888,11 +21591,35 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4903,11 +21630,35 @@ entry:
}
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4919,12 +21670,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4938,12 +21723,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -4959,11 +21778,36 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4976,12 +21820,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -4998,11 +21877,34 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5013,11 +21915,34 @@ entry:
}
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5029,12 +21954,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5048,12 +22006,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5069,11 +22060,35 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5086,12 +22101,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5108,11 +22157,39 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi796:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi797:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi798:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5123,11 +22200,39 @@ entry:
}
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi799:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi800:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi801:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5139,12 +22244,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi802:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi803:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi804:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5158,12 +22301,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi805:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi806:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi807:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5179,11 +22360,40 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5196,12 +22406,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5218,11 +22467,46 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi814:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi815:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi816:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5233,11 +22517,46 @@ entry:
}
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi817:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi818:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi819:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5249,12 +22568,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi820:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi821:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi822:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5268,12 +22632,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi823:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi824:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi825:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -5289,11 +22698,47 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi826:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi827:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi828:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5306,12 +22751,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi829:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi830:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi831:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -5328,12 +22819,53 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5344,12 +22876,53 @@ entry:
}
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5361,13 +22934,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5381,13 +23013,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5403,12 +23094,54 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5421,13 +23154,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5444,12 +23237,52 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5460,12 +23293,52 @@ entry:
}
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5477,13 +23350,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5497,13 +23428,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5519,12 +23508,53 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5537,13 +23567,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5560,12 +23649,41 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi832:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi833:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi834:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5576,12 +23694,41 @@ entry:
}
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5593,13 +23740,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5613,13 +23807,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5635,12 +23876,42 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5653,13 +23924,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi847:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi848:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi849:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5676,12 +23995,48 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi850:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi851:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi852:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5692,12 +24047,48 @@ entry:
}
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi853:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi854:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi855:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5709,13 +24100,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi856:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi857:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi858:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5729,13 +24174,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi859:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi860:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi861:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -5751,12 +24250,49 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi862:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi863:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi864:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5769,13 +24305,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi865:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi866:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi867:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -5792,12 +24383,20 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5808,12 +24407,20 @@ entry:
}
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5825,13 +24432,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5844,13 +24460,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5865,12 +24490,20 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5883,13 +24516,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5905,12 +24547,70 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5921,12 +24621,70 @@ entry:
}
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi871:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi872:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi873:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5938,13 +24696,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi874:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi875:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi876:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5957,13 +24774,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi877:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi878:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi879:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -5978,12 +24854,70 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi880:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi881:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi882:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -5996,13 +24930,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi883:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi884:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi885:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6018,12 +25011,75 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi886:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi887:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi888:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6034,12 +25090,75 @@ entry:
}
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi889:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi890:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi891:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6051,13 +25170,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6070,13 +25253,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi895:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi896:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi897:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6091,12 +25338,75 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi898:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi899:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi900:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6109,13 +25419,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi901:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi902:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi903:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -6131,11 +25505,124 @@ entry:
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi907:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi908:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi909:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi910:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi911:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6146,11 +25633,125 @@ entry:
}
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi912:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi913:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi914:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi915:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi916:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi917:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi918:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi919:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6162,12 +25763,126 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi920:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi921:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi922:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi923:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi924:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi925:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi926:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi927:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6180,12 +25895,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi931:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi932:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi933:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi934:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi935:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6200,11 +26030,129 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi936:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi937:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi938:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi939:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi940:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi941:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi942:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi943:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6215,11 +26163,130 @@ entry:
}
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi944:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi945:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi946:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi947:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi948:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi949:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi950:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi951:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6231,12 +26298,131 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi952:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi953:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi954:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi955:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi956:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi957:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi958:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi959:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -6249,12 +26435,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi960:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi961:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi962:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi963:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi964:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi965:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi966:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi967:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6269,12 +26575,48 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi968:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi969:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi970:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -6285,12 +26627,49 @@ entry:
}
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi971:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi972:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi973:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6302,13 +26681,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi974:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi975:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi976:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -6321,13 +26745,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi977:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi978:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi979:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6342,11 +26812,26 @@ entry:
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6357,11 +26842,27 @@ entry:
}
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6373,12 +26874,28 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6391,12 +26908,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6411,11 +26945,74 @@ entry:
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi980:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi981:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi982:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6426,11 +27023,75 @@ entry:
}
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi983:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi984:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi985:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6442,12 +27103,76 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi986:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi987:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi988:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6460,12 +27185,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi989:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi990:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi991:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6480,11 +27270,79 @@ entry:
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi992:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi993:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi994:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6495,11 +27353,80 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi995:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi996:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi997:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6511,12 +27438,81 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi998:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi999:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1000:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6529,12 +27525,82 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1001:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1002:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1003:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6549,12 +27615,125 @@ entry:
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1004:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1005:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1006:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1007:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1008:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1009:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1010:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1011:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6565,12 +27744,126 @@ entry:
}
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1012:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1013:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1014:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1015:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1016:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1017:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1018:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1019:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6582,13 +27875,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1020:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1021:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1022:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1023:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1024:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1025:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1026:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1027:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6601,13 +28008,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1028:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1029:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1030:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1031:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1032:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1033:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1034:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1035:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6622,12 +28144,130 @@ entry:
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1036:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1037:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1038:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1039:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1040:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1041:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1042:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1043:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6638,12 +28278,131 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1044:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1045:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1046:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1047:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1048:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1049:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1050:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1051:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6655,13 +28414,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1052:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1053:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1054:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1055:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1056:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1057:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1058:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1059:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -6674,13 +28552,133 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1060:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1061:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1062:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1063:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1064:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1065:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1066:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1067:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -6695,12 +28693,351 @@ entry:
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1068:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1069:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1070:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -6711,12 +29048,268 @@ entry:
}
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1071:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1072:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1073:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6728,13 +29321,361 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1074:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1075:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1076:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -6747,13 +29688,278 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1077:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1078:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1079:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3
+; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
+; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -6768,11 +29974,53 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6783,11 +30031,54 @@ entry:
}
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6799,12 +30090,70 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6818,12 +30167,71 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6839,12 +30247,55 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6857,13 +30308,72 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6880,11 +30390,52 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6895,11 +30446,53 @@ entry:
}
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6911,12 +30504,69 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -6930,12 +30580,70 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -6951,12 +30659,54 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6969,13 +30719,71 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -6992,11 +30800,41 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1080:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1081:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1082:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7007,11 +30845,42 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1083:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1084:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1085:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7023,12 +30892,58 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1086:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1087:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1088:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7042,12 +30957,59 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1089:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1090:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1091:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7063,12 +31025,43 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1092:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1093:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1094:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7081,13 +31074,60 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1095:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1096:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1097:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7104,11 +31144,48 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1098:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1099:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1100:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7119,11 +31196,49 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1101:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1102:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1103:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7135,12 +31250,65 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1104:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1105:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1106:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -7154,12 +31322,66 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1107:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1108:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1109:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7175,12 +31397,50 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1110:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1111:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1112:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7193,13 +31453,67 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1113:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1114:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1115:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -7216,21 +31530,23 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7242,21 +31558,23 @@ entry:
}
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7269,23 +31587,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7299,23 +31619,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7331,22 +31653,24 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7360,24 +31684,26 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7394,12 +31720,72 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1116:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1117:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1118:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7410,12 +31796,72 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1119:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1120:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1121:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7427,13 +31873,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1122:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1123:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1124:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7446,13 +31954,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1125:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1126:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1127:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7467,13 +32037,73 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1128:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1129:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1130:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7486,14 +32116,76 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1131:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1132:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1133:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7509,12 +32201,77 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1134:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1135:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1136:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7525,12 +32282,77 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1137:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1138:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1139:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7542,13 +32364,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1140:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1141:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1142:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -7561,13 +32450,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1143:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1144:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1145:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -7582,13 +32538,78 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1146:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1147:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1148:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7601,14 +32622,81 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1149:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1150:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1151:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -7624,12 +32712,120 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1152:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1153:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1154:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1155:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1156:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1157:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1158:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1159:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7640,12 +32836,120 @@ entry:
}
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1160:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1161:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1162:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1163:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1164:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1165:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1166:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1167:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7657,13 +32961,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1168:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1169:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1170:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1171:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1172:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1173:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1174:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1175:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7676,13 +33089,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1176:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1177:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1178:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1179:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1180:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1181:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1182:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1183:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7697,13 +33219,122 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1184:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1185:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1186:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1187:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1188:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1189:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1190:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1191:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7716,14 +33347,124 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1192:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1193:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1194:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1195:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1196:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1197:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1198:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1199:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7739,12 +33480,125 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1200:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1201:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1202:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1203:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1204:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1205:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1206:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1207:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7755,12 +33609,125 @@ entry:
}
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1208:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1209:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1210:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1211:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1212:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1213:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1214:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1215:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7772,13 +33739,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1216:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1217:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1218:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1219:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1220:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1221:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1222:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1223:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -7791,13 +33872,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1224:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1225:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1226:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1227:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1228:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1229:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1230:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1231:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -7812,13 +34007,127 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1232:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1233:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1234:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1235:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1236:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1237:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1238:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1239:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7831,14 +34140,129 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1240:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1241:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1242:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1243:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1244:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1245:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1246:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1247:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -7854,12 +34278,25 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7870,12 +34307,26 @@ entry:
}
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7887,13 +34338,34 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7907,13 +34379,35 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -7929,13 +34423,27 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -7948,14 +34456,36 @@ entry:
}
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -7972,11 +34502,37 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -7987,11 +34543,38 @@ entry:
}
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8003,12 +34586,46 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8022,12 +34639,47 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8043,12 +34695,39 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8061,13 +34740,48 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8084,11 +34798,36 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8099,11 +34838,37 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8115,12 +34880,45 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8134,12 +34932,46 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8155,12 +34987,38 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8173,13 +35031,47 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8196,11 +35088,41 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1248:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1249:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1250:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8211,11 +35133,42 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1251:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1252:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1253:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8227,12 +35180,50 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1254:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1255:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1256:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8246,12 +35237,51 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1257:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1258:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1259:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8267,12 +35297,43 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1260:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1261:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1262:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8285,13 +35346,52 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1263:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1264:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1265:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8308,11 +35408,48 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1266:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1267:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1268:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8323,11 +35460,49 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1269:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1270:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1271:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8339,12 +35514,57 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1272:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1273:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1274:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -8358,12 +35578,58 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1275:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1276:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1277:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -8379,12 +35645,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1278:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1279:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1280:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8397,13 +35701,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1281:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1282:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1283:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -8420,12 +35770,55 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8436,12 +35829,56 @@ entry:
}
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8453,13 +35890,74 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8473,13 +35971,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8495,13 +36055,57 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8514,14 +36118,76 @@ entry:
}
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8538,12 +36204,54 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8554,12 +36262,55 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8571,13 +36322,73 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8591,13 +36402,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8613,13 +36485,56 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8632,14 +36547,75 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8656,12 +36632,43 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1284:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1285:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1286:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8672,12 +36679,44 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1287:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1288:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1289:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8689,13 +36728,62 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1290:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1291:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1292:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8709,13 +36797,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1293:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1294:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1295:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8731,13 +36869,45 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1296:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1297:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1298:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8750,14 +36920,64 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1299:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1300:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1301:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8774,12 +36994,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1302:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1303:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1304:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8790,12 +37048,51 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1305:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1306:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1307:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8807,13 +37104,69 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1308:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1309:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1310:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -8827,13 +37180,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1311:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1312:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1313:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -8849,13 +37259,52 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1314:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1315:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1316:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8868,14 +37317,71 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1317:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1318:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1319:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -8892,12 +37398,20 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -8908,12 +37422,20 @@ entry:
}
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -8925,13 +37447,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -8944,13 +37475,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -8965,13 +37505,22 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -8984,14 +37533,24 @@ entry:
}
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9007,12 +37566,70 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1320:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1321:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1322:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9023,12 +37640,70 @@ entry:
}
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1323:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1324:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1325:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9040,13 +37715,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1326:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1327:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1328:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9059,13 +37793,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1329:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1330:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1331:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9080,13 +37873,72 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1332:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1333:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1334:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9099,14 +37951,74 @@ entry:
}
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1335:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1336:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1337:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9122,12 +38034,75 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1338:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1339:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1340:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9138,12 +38113,75 @@ entry:
}
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1341:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1342:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1343:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9155,13 +38193,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1344:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1345:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1346:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -9174,13 +38276,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1347:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1348:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1349:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9195,13 +38361,77 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1350:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1351:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1352:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9214,14 +38444,79 @@ entry:
}
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1353:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1354:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1355:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -9237,11 +38532,125 @@ entry:
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1356:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1357:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1358:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1359:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1360:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1361:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1362:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1363:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9252,11 +38661,125 @@ entry:
}
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1364:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1365:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1366:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1367:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1368:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1369:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1370:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1371:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9268,12 +38791,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1372:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1373:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1374:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1375:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1376:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1377:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1378:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1379:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9286,12 +38924,127 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1380:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1381:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1382:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1383:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1384:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1385:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1386:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1387:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9306,11 +39059,130 @@ entry:
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1388:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1389:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1390:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1391:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1392:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1393:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1394:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1395:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9321,11 +39193,130 @@ entry:
}
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1396:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1397:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1398:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1399:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1400:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1401:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1402:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1403:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9337,12 +39328,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1404:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1405:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1406:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1407:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1408:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1409:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1410:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1411:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -9355,12 +39466,132 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1412:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1413:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1414:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1415:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1416:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1417:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1418:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1419:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9375,12 +39606,49 @@ entry:
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1420:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1421:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1422:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -9391,12 +39659,49 @@ entry:
}
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltub (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1423:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1424:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1425:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9408,13 +39713,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1426:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1427:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1428:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -9427,13 +39778,59 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1429:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1430:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1431:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9448,11 +39845,27 @@ entry:
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9463,11 +39876,27 @@ entry:
}
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9479,12 +39908,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9497,12 +39943,29 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9517,11 +39980,75 @@ entry:
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1432:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1433:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1434:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9532,11 +40059,75 @@ entry:
}
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1435:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1436:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1437:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9548,12 +40139,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1438:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1439:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1440:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9566,12 +40222,77 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1441:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1442:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1443:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9586,11 +40307,80 @@ entry:
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1444:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1445:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1446:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9601,11 +40391,80 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1447:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1448:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1449:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9617,12 +40476,82 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1450:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1451:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1452:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -9635,12 +40564,82 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1453:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1454:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1455:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9655,12 +40654,126 @@ entry:
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1456:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1457:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1458:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1459:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1460:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1461:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1462:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1463:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9671,12 +40784,126 @@ entry:
}
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1464:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1465:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1466:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1467:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1468:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1469:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1470:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1471:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9688,13 +40915,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1472:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1473:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1474:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1475:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1476:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1477:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1478:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1479:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9707,13 +41049,128 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1480:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1481:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1482:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1483:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1484:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1485:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1486:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1487:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9728,12 +41185,131 @@ entry:
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1488:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1489:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1490:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1491:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1492:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1493:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1494:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1495:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9744,12 +41320,131 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1496:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1497:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1498:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1499:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1500:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1501:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1502:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1503:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9761,13 +41456,133 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1504:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1505:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1506:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1507:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1508:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1509:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1510:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1511:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -9780,13 +41595,133 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1512:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1513:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1514:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1515:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1516:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1517:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1518:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1519:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -9801,12 +41736,353 @@ entry:
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1520:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1521:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1522:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -9817,12 +42093,268 @@ entry:
}
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1523:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1524:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1525:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9834,13 +42366,363 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1526:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1527:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1528:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm0
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm7
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3
+; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4
+; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2
+; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -9853,13 +42735,278 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1529:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1530:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1531:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
+; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -9874,11 +43021,54 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9889,11 +43079,54 @@ entry:
}
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9905,12 +43138,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9924,12 +43218,73 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -9945,11 +43300,55 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -9962,12 +43361,74 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -9984,11 +43445,53 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -9999,11 +43502,53 @@ entry:
}
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10015,12 +43560,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10034,12 +43639,72 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10055,11 +43720,54 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10072,12 +43780,73 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10094,11 +43863,42 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1532:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1533:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1534:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10109,11 +43909,42 @@ entry:
}
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1535:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1536:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1537:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10125,12 +43956,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1538:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1539:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1540:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10144,12 +44024,61 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1541:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1542:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1543:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10165,11 +44094,43 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1544:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1545:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1546:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10182,12 +44143,62 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1547:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1548:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1549:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10204,11 +44215,49 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1550:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1551:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1552:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10219,11 +44268,49 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1553:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1554:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1555:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10235,12 +44322,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1556:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1557:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1558:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10254,12 +44397,68 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1559:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1560:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1561:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10275,11 +44474,50 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1562:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1563:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1564:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10292,12 +44530,69 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1565:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1566:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1567:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10314,21 +44609,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10340,21 +44637,23 @@ entry:
}
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10367,23 +44666,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10397,23 +44698,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10429,21 +44732,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10457,23 +44762,25 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10490,12 +44797,72 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1568:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1569:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1570:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10506,12 +44873,72 @@ entry:
}
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1571:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1572:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1573:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10523,13 +44950,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1574:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1575:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1576:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10542,13 +45031,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1577:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1578:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1579:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10563,12 +45114,72 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1580:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1581:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1582:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10581,13 +45192,75 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1583:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1584:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1585:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10603,12 +45276,77 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1586:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1587:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1588:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10619,12 +45357,77 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1589:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1590:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1591:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10636,13 +45439,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1592:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1593:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1594:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10655,13 +45525,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1595:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1596:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1597:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10676,12 +45613,77 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1598:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1599:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1600:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10694,13 +45696,80 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1601:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1602:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1603:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10716,12 +45785,120 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1604:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1605:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1606:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1607:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1608:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1609:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1610:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1611:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10732,12 +45909,120 @@ entry:
}
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1612:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1613:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1614:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1615:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1616:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1617:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1618:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1619:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10749,13 +46034,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1620:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1621:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1622:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1623:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1624:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1625:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1626:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1627:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10768,13 +46162,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1628:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1629:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1630:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1631:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1632:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1633:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1634:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1635:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10789,12 +46292,120 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1636:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1637:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1638:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1639:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1640:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1641:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1642:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1643:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10807,13 +46418,122 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1644:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1645:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1646:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1647:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1648:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1649:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1650:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1651:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10829,12 +46549,125 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1652:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1653:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1654:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1655:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1656:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1657:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1658:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1659:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10845,12 +46678,125 @@ entry:
}
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1660:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1661:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1662:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1663:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1664:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1665:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1666:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1667:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10862,13 +46808,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1668:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1669:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1670:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1671:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1672:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1673:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1674:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1675:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -10881,13 +46941,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1676:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1677:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1678:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1679:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1680:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1681:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1682:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1683:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -10902,12 +47076,125 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1684:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1685:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1686:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1687:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1688:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1689:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1690:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1691:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10920,13 +47207,127 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1692:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1693:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1694:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1695:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1696:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1697:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1698:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1699:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
%load = load i32, i32* %__b
@@ -10942,12 +47343,26 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -10958,12 +47373,26 @@ entry:
}
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10975,13 +47404,37 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -10995,13 +47448,37 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11017,12 +47494,27 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11035,13 +47527,38 @@ entry:
}
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11058,11 +47575,38 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11073,11 +47617,38 @@ entry:
}
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11089,12 +47660,49 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11108,12 +47716,49 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11129,11 +47774,39 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11146,12 +47819,50 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11168,11 +47879,37 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11183,11 +47920,37 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11199,12 +47962,48 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11218,12 +48017,48 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11239,11 +48074,38 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11256,12 +48118,49 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11278,11 +48177,42 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1700:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1701:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1702:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11293,11 +48223,42 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1703:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1704:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1705:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11309,12 +48270,53 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1706:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1707:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1708:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11328,12 +48330,53 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1709:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1710:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1711:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11349,11 +48392,43 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1712:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1713:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1714:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11366,12 +48441,54 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1715:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1716:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1717:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11388,11 +48505,49 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1718:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1719:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1720:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11403,11 +48558,49 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1721:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1722:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1723:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11419,12 +48612,60 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1724:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1725:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1726:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11438,12 +48679,60 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1727:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1728:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1729:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -11459,11 +48748,50 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1730:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1731:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1732:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11476,12 +48804,61 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1733:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1734:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1735:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
%load = load i64, i64* %__b
@@ -11498,12 +48875,56 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11514,12 +48935,56 @@ entry:
}
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11531,13 +48996,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11551,13 +49078,75 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11573,12 +49162,57 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11591,13 +49225,76 @@ entry:
}
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11614,12 +49311,55 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11630,12 +49370,55 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11647,13 +49430,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11667,13 +49511,74 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11689,12 +49594,56 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11707,13 +49656,75 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11730,12 +49741,44 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1736:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1737:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1738:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11746,12 +49789,44 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1739:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1740:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1741:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11763,13 +49838,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1742:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1743:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1744:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11783,13 +49908,63 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1745:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1746:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1747:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11805,12 +49980,45 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1748:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1749:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1750:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11823,13 +50031,64 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1751:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1752:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1753:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11846,12 +50105,51 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1754:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1755:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1756:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11862,12 +50160,51 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1757:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1758:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1759:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11879,13 +50216,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1760:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1761:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1762:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11899,13 +50293,70 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1763:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1764:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1765:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -11921,12 +50372,52 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1766:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1767:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1768:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11939,13 +50430,71 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1769:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1770:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1771:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
%load = load i64, i64* %__b
@@ -11962,12 +50511,20 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -11978,12 +50535,20 @@ entry:
}
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -11995,13 +50560,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12014,13 +50588,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12035,12 +50618,20 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12053,13 +50644,22 @@ entry:
}
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12075,12 +50675,70 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1772:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1773:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1774:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12091,12 +50749,70 @@ entry:
}
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1775:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1776:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1777:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12108,13 +50824,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1778:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1779:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1780:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12127,13 +50902,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1781:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1782:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1783:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12148,12 +50982,70 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1784:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1785:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1786:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12166,13 +51058,72 @@ entry:
}
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1787:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1788:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1789:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12188,12 +51139,75 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1790:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1791:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1792:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12204,12 +51218,75 @@ entry:
}
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1793:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1794:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1795:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12221,13 +51298,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1796:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1797:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1798:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -12240,13 +51381,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1799:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1800:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1801:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12261,12 +51466,75 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1802:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1803:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1804:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12279,13 +51547,77 @@ entry:
}
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1805:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1806:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1807:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
%load = load i64, i64* %__b
@@ -12302,11 +51634,51 @@ entry:
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12317,11 +51689,51 @@ entry:
}
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12333,11 +51745,52 @@ entry:
}
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12351,11 +51804,50 @@ entry:
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12366,11 +51858,50 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12382,11 +51913,51 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12400,11 +51971,39 @@ entry:
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12415,11 +52014,39 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12431,11 +52058,40 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1814:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1815:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1816:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12449,11 +52105,46 @@ entry:
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1817:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1818:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1819:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%1 = bitcast <2 x i64> %__b to <4 x float>
@@ -12464,11 +52155,46 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1820:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1821:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1822:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12480,11 +52206,47 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1823:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1824:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1825:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
%load = load float, float* %__b
@@ -12498,21 +52260,23 @@ entry:
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12524,21 +52288,23 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12551,21 +52317,23 @@ entry:
}
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -12580,12 +52348,72 @@ entry:
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1826:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1827:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1828:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%1 = bitcast <4 x i64> %__b to <8 x float>
@@ -12596,12 +52424,72 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1829:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1830:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1831:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -12613,12 +52501,72 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1832:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1833:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1834:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
@@ -12632,12 +52580,77 @@ entry:
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%1 = bitcast <4 x i64> %__b to <8 x float>
@@ -12648,12 +52661,77 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovaps (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -12665,12 +52743,77 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
%load = load float, float* %__b
@@ -12684,12 +52827,120 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1847:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1848:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1849:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1850:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1851:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12700,12 +52951,120 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1852:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1853:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1854:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1855:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1856:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1857:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1858:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1859:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12717,12 +53076,120 @@ entry:
}
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1860:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1861:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1862:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: .Lcfi1863:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1864:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1865:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1866:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1867:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load float, float* %__b
@@ -12736,12 +53203,18 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovw %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12752,12 +53225,125 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1871:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1872:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1873:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1874:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1875:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12768,12 +53354,125 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1876:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1877:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1878:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1879:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1880:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1881:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1882:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1883:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -12785,12 +53484,125 @@ entry:
}
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1884:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1885:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1886:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: pushq %r15
+; NoVLX-NEXT: pushq %r14
+; NoVLX-NEXT: pushq %r13
+; NoVLX-NEXT: pushq %r12
+; NoVLX-NEXT: pushq %rbx
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1887:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1888:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1889:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1890:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1891:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%load = load float, float* %__b
@@ -12804,13 +53616,20 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
@@ -12822,12 +53641,23 @@ entry:
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12838,12 +53668,23 @@ entry:
}
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12855,12 +53696,24 @@ entry:
}
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12874,11 +53727,35 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12889,11 +53766,35 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12905,11 +53806,36 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12923,11 +53849,34 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12938,11 +53887,34 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -12954,11 +53926,35 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -12972,11 +53968,39 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -12987,11 +54011,39 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1895:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1896:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1897:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -13003,11 +54055,40 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1898:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1899:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1900:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -13021,11 +54102,46 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1901:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1902:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1903:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%1 = bitcast <2 x i64> %__b to <2 x double>
@@ -13036,11 +54152,46 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1904:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1905:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1906:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -13052,11 +54203,47 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1907:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1908:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1909:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
%load = load double, double* %__b
@@ -13070,12 +54257,53 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13086,12 +54314,53 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13103,12 +54372,54 @@ entry:
}
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13122,12 +54433,52 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13138,12 +54489,52 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13155,12 +54546,53 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13174,12 +54606,41 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1910:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1911:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1912:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13190,12 +54651,41 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1913:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1914:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1915:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13207,12 +54697,42 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1916:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1917:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1918:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13226,12 +54746,48 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1919:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1920:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1921:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%1 = bitcast <4 x i64> %__b to <4 x double>
@@ -13242,12 +54798,48 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1922:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1923:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1924:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -13259,12 +54851,49 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1925:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1926:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1927:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
%load = load double, double* %__b
@@ -13278,12 +54907,20 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13294,12 +54931,20 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13311,12 +54956,20 @@ entry:
}
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13330,12 +54983,22 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13346,12 +55009,70 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1928:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1929:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1930:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13362,12 +55083,70 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1931:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1932:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1933:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13379,12 +55158,70 @@ entry:
}
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1934:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1935:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1936:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13398,12 +55235,19 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovb %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13414,12 +55258,75 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1937:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1938:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1939:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
@@ -13430,12 +55337,75 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1940:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1941:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1942:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load <8 x i64>, <8 x i64>* %__b
@@ -13447,12 +55417,75 @@ entry:
}
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1943:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1944:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1945:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%load = load double, double* %__b
@@ -13466,13 +55499,20 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
%1 = bitcast <8 x i64> %__b to <8 x double>
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 092b139fca2f..1d78ee26a0b9 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -1,48 +1,48 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
; SSE2-LABEL: v8i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i16> %a, %b
%x1 = icmp sgt <8 x i16> %c, %d
@@ -53,25 +53,25 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -87,25 +87,25 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -121,29 +121,29 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <16 x i8> %a, %b
%x1 = icmp sgt <16 x i8> %c, %d
@@ -154,7 +154,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -206,11 +206,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -235,11 +235,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -264,11 +264,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $56, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $56, %xmm2, %xmm2
@@ -292,7 +292,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -344,11 +344,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -373,11 +373,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -402,11 +402,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $48, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2
@@ -430,7 +430,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -474,11 +474,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -499,11 +499,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -524,11 +524,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512-NEXT: vpsraq $32, %xmm3, %xmm3
; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
@@ -552,7 +552,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
@@ -576,20 +576,20 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -605,25 +605,25 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -639,7 +639,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm3
; SSE2-SSSE3-NEXT: psrad $24, %xmm3
; SSE2-SSSE3-NEXT: pslld $24, %xmm2
@@ -652,11 +652,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $24, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
@@ -669,11 +669,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $24, %xmm3, %xmm3
; AVX512-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX512-NEXT: vpslld $24, %xmm2, %xmm2
@@ -697,7 +697,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm3
; SSE2-SSSE3-NEXT: psrad $16, %xmm3
; SSE2-SSSE3-NEXT: pslld $16, %xmm2
@@ -710,11 +710,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $16, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
@@ -727,11 +727,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $16, %xmm3, %xmm3
; AVX512-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX512-NEXT: vpslld $16, %xmm2, %xmm2
@@ -755,7 +755,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-LABEL: v8i8:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: psllw $8, %xmm3
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: psllw $8, %xmm2
@@ -770,11 +770,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i8:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: psllw $8, %xmm3
; SSSE3-NEXT: psraw $8, %xmm3
; SSSE3-NEXT: psllw $8, %xmm2
@@ -788,11 +788,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
@@ -806,11 +806,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX512-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX512-NEXT: vpsllw $8, %xmm2, %xmm2
@@ -822,7 +822,7 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i8> %a, %b
%x1 = icmp sgt <8 x i8> %c, %d
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index a6d6ca155302..95529686a58a 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-LABEL: v4i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1
@@ -57,11 +57,11 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v4i64:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -74,12 +74,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -88,12 +88,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -110,7 +110,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; SSE2-SSSE3-LABEL: v4f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
@@ -123,11 +123,11 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; SSE2-SSSE3-NEXT: psrad $31, %xmm6
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -136,12 +136,12 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
@@ -158,7 +158,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSE2-LABEL: v16i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -181,11 +181,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm3, %xmm1
@@ -208,11 +208,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX1-LABEL: v16i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
@@ -225,12 +225,12 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -239,16 +239,16 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtw %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = icmp sgt <16 x i16> %a, %b
@@ -260,7 +260,7 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-LABEL: v8i32:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
@@ -287,11 +287,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm3, %xmm1
@@ -310,11 +310,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSSE3-NEXT: pand %xmm0, %xmm4
; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm4, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX1-LABEL: v8i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
@@ -328,12 +328,12 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -343,16 +343,16 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = icmp sgt <8 x i32> %a, %b
@@ -364,7 +364,7 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
; SSE2-LABEL: v8f32:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: cmpltps %xmm1, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
@@ -391,11 +391,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8f32:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: cmpltps %xmm1, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm3
@@ -414,11 +414,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; SSSE3-NEXT: pand %xmm2, %xmm6
; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm6, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -428,16 +428,16 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x0 = fcmp ogt <8 x float> %a, %b
@@ -449,7 +449,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-LABEL: v32i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4
@@ -561,14 +561,14 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v32i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: Lcfi0:
+; AVX1-NEXT: .Lcfi0:
; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: Lcfi1:
+; AVX1-NEXT: .Lcfi1:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: Lcfi2:
+; AVX1-NEXT: .Lcfi2:
; AVX1-NEXT: .cfi_def_cfa_register %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $32, %rsp
@@ -687,7 +687,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -696,7 +696,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpgtb %ymm3, %ymm2, %k0 {%k1}
; AVX512-NEXT: kmovd %k0, %eax
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
new file mode 100644
index 000000000000..2eba79b0297f
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -0,0 +1,1868 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
+; SSE-LABEL: v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $31, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pshufb %xmm3, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
+; SSE-NEXT: pslld $31, %xmm9
+; SSE-NEXT: psrad $31, %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
+; SSE-NEXT: pslld $31, %xmm8
+; SSE-NEXT: psrad $31, %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <8 x i64> %a, %b
+ %x1 = icmp sgt <8 x i64> %c, %d
+ %y = and <8 x i1> %x0, %x1
+ %res = bitcast <8 x i1> %y to i8
+ ret i8 %res
+}
+
+define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d) {
+; SSE-LABEL: v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltpd %xmm3, %xmm7
+; SSE-NEXT: cmpltpd %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE-NEXT: pslld $31, %xmm6
+; SSE-NEXT: psrad $31, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm2, %xmm6
+; SSE-NEXT: cmpltpd %xmm1, %xmm5
+; SSE-NEXT: cmpltpd %xmm0, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE-NEXT: pslld $31, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufb %xmm2, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: psllw $15, %xmm4
+; SSE-NEXT: psraw $15, %xmm4
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
+; SSE-NEXT: pslld $31, %xmm9
+; SSE-NEXT: psrad $31, %xmm9
+; SSE-NEXT: pshufb %xmm2, %xmm9
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
+; SSE-NEXT: pslld $31, %xmm8
+; SSE-NEXT: psrad $31, %xmm8
+; SSE-NEXT: pshufb %xmm2, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pand %xmm4, %xmm8
+; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX12-LABEL: v8f64:
+; AVX12: # BB#0:
+; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX12-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2
+; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX12-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX12-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX12-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX12-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
+;
+; AVX512F-LABEL: v8f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = fcmp ogt <8 x double> %a, %b
+ %x1 = fcmp ogt <8 x double> %c, %d
+ %y = and <8 x i1> %x0, %x1
+ %res = bitcast <8 x i1> %y to i8
+ ret i8 %res
+}
+
+define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
+; SSE-LABEL: v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm5, %xmm1
+; SSE-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE-NEXT: pshufb %xmm5, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE-NEXT: pshufb %xmm5, %xmm3
+; SSE-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE-NEXT: pshufb %xmm5, %xmm2
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm5, %xmm11
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm5, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0]
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm5, %xmm10
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm5, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE-NEXT: pand %xmm2, %xmm9
+; SSE-NEXT: pextrb $15, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi1:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi2:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <32 x i16> %a, %b
+ %x1 = icmp sgt <32 x i16> %c, %d
+ %y = and <32 x i1> %x0, %x1
+ %res = bitcast <32 x i1> %y to i32
+ ret i32 %res
+}
+
+define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
+; SSE-LABEL: v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: pshufb %xmm7, %xmm2
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: psllw $15, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pshufb %xmm7, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pshufb %xmm7, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: psllw $15, %xmm0
+; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: pshufb %xmm3, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtb %xmm0, %xmm4
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm7, %xmm11
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm7, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE-NEXT: psllw $15, %xmm9
+; SSE-NEXT: psraw $15, %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm7, %xmm10
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm7, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $7, %xmm8
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pcmpgtb %xmm8, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm3
+; AVX1-NEXT: vpacksswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX2-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <16 x i32> %a, %b
+ %x1 = icmp sgt <16 x i32> %c, %d
+ %y = and <16 x i1> %x0, %x1
+ %res = bitcast <16 x i1> %y to i16
+ ret i16 %res
+}
+
+define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) {
+; SSE-LABEL: v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE-NEXT: pshufb %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: pshufb %xmm3, %xmm6
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT: psllw $15, %xmm6
+; SSE-NEXT: psraw $15, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE-NEXT: pshufb %xmm2, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: pshufb %xmm3, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: pshufb %xmm3, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-NEXT: psllw $15, %xmm4
+; SSE-NEXT: psraw $15, %xmm4
+; SSE-NEXT: pshufb %xmm2, %xmm4
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE-NEXT: psllw $7, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtb %xmm4, %xmm5
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pshufb %xmm3, %xmm11
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pshufb %xmm3, %xmm9
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE-NEXT: psllw $15, %xmm9
+; SSE-NEXT: psraw $15, %xmm9
+; SSE-NEXT: pshufb %xmm2, %xmm9
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pshufb %xmm3, %xmm10
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pshufb %xmm3, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
+; SSE-NEXT: psllw $15, %xmm8
+; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: pshufb %xmm2, %xmm8
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
+; SSE-NEXT: psllw $7, %xmm8
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: pcmpgtb %xmm8, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX12-LABEL: v16f32:
+; AVX12: # BB#0:
+; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX12-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm5
+; AVX12-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX12-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
+; AVX12-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm4
+; AVX12-NEXT: vextractf128 $1, %ymm4, %xmm6
+; AVX12-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX12-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX12-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX12-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
+;
+; AVX512F-LABEL: v16f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = fcmp ogt <16 x float> %a, %b
+ %x1 = fcmp ogt <16 x float> %c, %d
+ %y = and <16 x i1> %x0, %x1
+ %res = bitcast <16 x i1> %y to i16
+ ret i16 %res
+}
+
+define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; SSE-LABEL: v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: pand %xmm3, %xmm9
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: pextrb $15, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm11, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm10, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm9, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $15, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm8, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpcmpgtb %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl (%rsp), %ecx
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl (%rsp), %ecx
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi4:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi5:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl (%rsp), %ecx
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: shlq $32, %rax
+; AVX512F-NEXT: orq %rcx, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x0 = icmp sgt <64 x i8> %a, %b
+ %x1 = icmp sgt <64 x i8> %c, %d
+ %y = and <64 x i1> %x0, %x1
+ %res = bitcast <64 x i1> %y to i64
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
new file mode 100644
index 000000000000..9b6401d1a76c
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -0,0 +1,3483 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+;
+; 128-bit vectors
+;
+
+define <2 x i64> @ext_i2_2i64(i2 %a0) {
+; SSE2-SSSE3-LABEL: ext_i2_2i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $3, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm1
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm0
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i2_2i64:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $3, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shlq $63, %rax
+; AVX12-NEXT: sarq $63, %rax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i2_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $3, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ %2 = sext <2 x i1> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <4 x i32> @ext_i4_4i32(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $60, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $61, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i4_4i32:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $15, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shlq $60, %rax
+; AVX12-NEXT: sarq $63, %rax
+; AVX12-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @ext_i8_8i16(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shrq $7, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $57, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $58, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $59, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $60, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $61, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shlq $63, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i8_8i16:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $60, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $59, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $58, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $57, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrq $7, %rax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @ext_i16_16i8(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi0:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi1:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi2:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi3:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi4:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi5:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi6:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi7:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi8:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi9:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi10:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi11:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; SSE2-SSSE3-NEXT: movq %rax, %r8
+; SSE2-SSSE3-NEXT: movq %rax, %r9
+; SSE2-SSSE3-NEXT: movq %rax, %r10
+; SSE2-SSSE3-NEXT: movq %rax, %r11
+; SSE2-SSSE3-NEXT: movq %rax, %r14
+; SSE2-SSSE3-NEXT: movq %rax, %r15
+; SSE2-SSSE3-NEXT: movq %rax, %r12
+; SSE2-SSSE3-NEXT: movq %rax, %r13
+; SSE2-SSSE3-NEXT: movq %rax, %rbx
+; SSE2-SSSE3-NEXT: movq %rax, %rcx
+; SSE2-SSSE3-NEXT: movq %rax, %rdx
+; SSE2-SSSE3-NEXT: movq %rax, %rsi
+; SSE2-SSSE3-NEXT: movq %rax, %rdi
+; SSE2-SSSE3-NEXT: movq %rax, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm0
+; SSE2-SSSE3-NEXT: movq %rax, %rbp
+; SSE2-SSSE3-NEXT: movsbq %al, %rax
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm2
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
+; SSE2-SSSE3-NEXT: shrq $7, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i16_16i8:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $62, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: movq %rax, %rdx
+; AVX12-NEXT: shlq $63, %rdx
+; AVX12-NEXT: sarq $63, %rdx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $61, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $60, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $59, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $58, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $57, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movsbq %al, %rcx
+; AVX12-NEXT: shrq $7, %rcx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $55, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $54, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $53, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $52, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $51, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $50, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movq %rax, %rcx
+; AVX12-NEXT: shlq $49, %rcx
+; AVX12-NEXT: sarq $63, %rcx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrq $15, %rax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x i64> @ext_i4_4i64(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: shlq $63, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = sext <4 x i1> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @ext_i8_8i32(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $59, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $59, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <16 x i16> @ext_i16_16i16(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm0
+; SSE2-SSSE3-NEXT: psraw $15, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm1
+; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: .Lcfi6:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi7:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi8:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi9:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi10:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Lcfi11:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $55, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %r10
+; AVX1-NEXT: movq %rax, %r11
+; AVX1-NEXT: movq %rax, %r14
+; AVX1-NEXT: movq %rax, %r15
+; AVX1-NEXT: movq %rax, %r9
+; AVX1-NEXT: movq %rax, %r12
+; AVX1-NEXT: movq %rax, %r13
+; AVX1-NEXT: movq %rax, %rbx
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: movq %rax, %rsi
+; AVX1-NEXT: movsbq %al, %rbp
+; AVX1-NEXT: shlq $54, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shlq $53, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $52, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $51, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $50, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $49, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrq $15, %r9
+; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shlq $63, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vmovd %r13d, %xmm1
+; AVX1-NEXT: shlq $62, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shrq $7, %rbp
+; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: .Lcfi3:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: .Lcfi4:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Lcfi5:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: .Lcfi6:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi7:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi8:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi9:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi10:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Lcfi11:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $55, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: movq %rax, %r14
+; AVX2-NEXT: movq %rax, %r15
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq %rax, %r12
+; AVX2-NEXT: movq %rax, %r13
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: movsbq %al, %rbp
+; AVX2-NEXT: shlq $54, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shlq $53, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $52, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $51, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $50, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $49, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrq $15, %r9
+; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shlq $63, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vmovd %r13d, %xmm1
+; AVX2-NEXT: shlq $62, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shrq $7, %rbp
+; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @ext_i32_32i8(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi12:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi13:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi14:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi15:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi16:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi17:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi18:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi19:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi20:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi21:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi22:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi23:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
+; SSE2-SSSE3-NEXT: movq %rbx, %r8
+; SSE2-SSSE3-NEXT: movq %rbx, %r9
+; SSE2-SSSE3-NEXT: movq %rbx, %r10
+; SSE2-SSSE3-NEXT: movq %rbx, %r11
+; SSE2-SSSE3-NEXT: movq %rbx, %r14
+; SSE2-SSSE3-NEXT: movq %rbx, %r15
+; SSE2-SSSE3-NEXT: movq %rbx, %r12
+; SSE2-SSSE3-NEXT: movq %rbx, %r13
+; SSE2-SSSE3-NEXT: movq %rbx, %rdi
+; SSE2-SSSE3-NEXT: movq %rbx, %rcx
+; SSE2-SSSE3-NEXT: movq %rbx, %rdx
+; SSE2-SSSE3-NEXT: movq %rbx, %rbp
+; SSE2-SSSE3-NEXT: movq %rbx, %rsi
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: shrq $15, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
+; SSE2-SSSE3-NEXT: shlq $61, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: shlq $58, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm13
+; SSE2-SSSE3-NEXT: shlq $59, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm7
+; SSE2-SSSE3-NEXT: shlq $57, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: shrq $7, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm4
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm4
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi12:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi13:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi14:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: .Lcfi15:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi16:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi17:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi18:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi19:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, (%rsp)
+; AVX1-NEXT: movslq (%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $45, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $44, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $43, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $42, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $41, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $40, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $39, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $38, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $37, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $36, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $35, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $34, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: shlq $33, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vmovd %r8d, %xmm1
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX1-NEXT: shlq $62, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi12:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi13:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi14:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: .Lcfi15:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi16:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi17:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi18:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi19:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, (%rsp)
+; AVX2-NEXT: movslq (%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $45, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $44, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $43, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $42, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $41, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $40, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $39, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $38, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $37, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $36, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $35, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $34, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: shlq $33, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vmovd %r8d, %xmm1
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT: shlq $62, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = sext <32 x i1> %1 to <32 x i8>
+ ret <32 x i8> %2
+}
+
+;
+; 512-bit vectors
+;
+
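+; Each bit of the i8 mask becomes one i64 lane: bitcast to <8 x i1>, then sext to <8 x i64>.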
+define <8 x i64> @ext_i8_8i64(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: psllq $63, %xmm3
+; SSE2-SSSE3-NEXT: psrad $31, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = sext <8 x i1> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
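+; Bitcast the i16 mask to <16 x i1> and sign-extend each bit into a full i32 lane.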
+define <16 x i32> @ext_i16_16i32(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm1
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm3
+; SSE2-SSSE3-NEXT: psrad $31, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ ret <16 x i32> %2
+}
+
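+; Bitcast the i32 mask to <32 x i1> and sign-extend each bit into an i16 lane.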
+define <32 x i16> @ext_i32_32i16(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %edi, %eax
+; SSE2-SSSE3-NEXT: shrl $16, %eax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm0
+; SSE2-SSSE3-NEXT: psraw $15, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm1
+; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm2
+; SSE2-SSSE3-NEXT: psraw $15, %xmm2
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: psllw $15, %xmm3
+; SSE2-SSSE3-NEXT: psraw $15, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi20:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi21:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi22:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi23:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi24:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi25:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi26:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi27:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: movl %edi, %r12d
+; AVX1-NEXT: movl %edi, %r15d
+; AVX1-NEXT: movl %edi, %r14d
+; AVX1-NEXT: movl %edi, %ebx
+; AVX1-NEXT: movl %edi, %r11d
+; AVX1-NEXT: movl %edi, %r10d
+; AVX1-NEXT: movl %edi, %r9d
+; AVX1-NEXT: movl %edi, %r8d
+; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movl %edi, %edx
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $4, %esi
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX1-NEXT: shrl $5, %r8d
+; AVX1-NEXT: andl $1, %r8d
+; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $6, %r9d
+; AVX1-NEXT: andl $1, %r9d
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %r10d
+; AVX1-NEXT: andl $1, %r10d
+; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $8, %r11d
+; AVX1-NEXT: andl $1, %r11d
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $9, %ebx
+; AVX1-NEXT: andl $1, %ebx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $10, %r14d
+; AVX1-NEXT: andl $1, %r14d
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $11, %r15d
+; AVX1-NEXT: andl $1, %r15d
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $12, %r12d
+; AVX1-NEXT: andl $1, %r12d
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $13, %r13d
+; AVX1-NEXT: andl $1, %r13d
+; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi20:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi21:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi22:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi23:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi24:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi25:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi26:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi27:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, %r13d
+; AVX2-NEXT: movl %edi, %r12d
+; AVX2-NEXT: movl %edi, %r15d
+; AVX2-NEXT: movl %edi, %r14d
+; AVX2-NEXT: movl %edi, %ebx
+; AVX2-NEXT: movl %edi, %r11d
+; AVX2-NEXT: movl %edi, %r10d
+; AVX2-NEXT: movl %edi, %r9d
+; AVX2-NEXT: movl %edi, %r8d
+; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movl %edi, %edx
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $4, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrl $5, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $6, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $8, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $9, %ebx
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $10, %r14d
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $11, %r15d
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $12, %r12d
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $13, %r13d
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = sext <32 x i1> %1 to <32 x i16>
+ ret <32 x i16> %2
+}
+
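+; Bitcast the i64 mask to <64 x i1> and sign-extend each bit into an i8 lane.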
+define <64 x i8> @ext_i64_64i8(i64 %a0) {
+; SSE2-SSSE3-LABEL: ext_i64_64i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pushq %rbp
+; SSE2-SSSE3-NEXT: .Lcfi24:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE2-SSSE3-NEXT: pushq %r15
+; SSE2-SSSE3-NEXT: .Lcfi25:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE2-SSSE3-NEXT: pushq %r14
+; SSE2-SSSE3-NEXT: .Lcfi26:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE2-SSSE3-NEXT: pushq %r13
+; SSE2-SSSE3-NEXT: .Lcfi27:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE2-SSSE3-NEXT: pushq %r12
+; SSE2-SSSE3-NEXT: .Lcfi28:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE2-SSSE3-NEXT: pushq %rbx
+; SSE2-SSSE3-NEXT: .Lcfi29:
+; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE2-SSSE3-NEXT: .Lcfi30:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
+; SSE2-SSSE3-NEXT: .Lcfi31:
+; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
+; SSE2-SSSE3-NEXT: .Lcfi32:
+; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
+; SSE2-SSSE3-NEXT: .Lcfi33:
+; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
+; SSE2-SSSE3-NEXT: .Lcfi34:
+; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
+; SSE2-SSSE3-NEXT: .Lcfi35:
+; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $32, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $48, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
+; SSE2-SSSE3-NEXT: movq %rbx, %r8
+; SSE2-SSSE3-NEXT: movq %rbx, %r9
+; SSE2-SSSE3-NEXT: movq %rbx, %r10
+; SSE2-SSSE3-NEXT: movq %rbx, %r11
+; SSE2-SSSE3-NEXT: movq %rbx, %r14
+; SSE2-SSSE3-NEXT: movq %rbx, %r15
+; SSE2-SSSE3-NEXT: movq %rbx, %r12
+; SSE2-SSSE3-NEXT: movq %rbx, %r13
+; SSE2-SSSE3-NEXT: movq %rbx, %rdi
+; SSE2-SSSE3-NEXT: movq %rbx, %rcx
+; SSE2-SSSE3-NEXT: movq %rbx, %rdx
+; SSE2-SSSE3-NEXT: movq %rbx, %rsi
+; SSE2-SSSE3-NEXT: movq %rbx, %rbp
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: shrq $15, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movq %rbx, %rax
+; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm2
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
+; SSE2-SSSE3-NEXT: shlq $61, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $62, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $63, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: shlq $58, %rsi
+; SSE2-SSSE3-NEXT: sarq $63, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm13
+; SSE2-SSSE3-NEXT: shlq $59, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
+; SSE2-SSSE3-NEXT: shlq $57, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: shrq $7, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm13
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm15
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm7
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm11
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm3
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm13
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm15
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm11
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm12
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm5
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm14
+; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
+; SSE2-SSSE3-NEXT: movq %rsi, %r8
+; SSE2-SSSE3-NEXT: movq %rsi, %r9
+; SSE2-SSSE3-NEXT: movq %rsi, %r10
+; SSE2-SSSE3-NEXT: movq %rsi, %r11
+; SSE2-SSSE3-NEXT: movq %rsi, %r14
+; SSE2-SSSE3-NEXT: movq %rsi, %r15
+; SSE2-SSSE3-NEXT: movq %rsi, %r12
+; SSE2-SSSE3-NEXT: movq %rsi, %r13
+; SSE2-SSSE3-NEXT: movq %rsi, %rbx
+; SSE2-SSSE3-NEXT: movq %rsi, %rax
+; SSE2-SSSE3-NEXT: movq %rsi, %rcx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdx
+; SSE2-SSSE3-NEXT: movq %rsi, %rdi
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: shrq $15, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
+; SSE2-SSSE3-NEXT: movq %rsi, %rbp
+; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; SSE2-SSSE3-NEXT: shlq $49, %r8
+; SSE2-SSSE3-NEXT: sarq $63, %r8
+; SSE2-SSSE3-NEXT: movd %r8d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-SSSE3-NEXT: shlq $50, %r9
+; SSE2-SSSE3-NEXT: sarq $63, %r9
+; SSE2-SSSE3-NEXT: movd %r9d, %xmm6
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $51, %r10
+; SSE2-SSSE3-NEXT: sarq $63, %r10
+; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: shlq $52, %r11
+; SSE2-SSSE3-NEXT: sarq $63, %r11
+; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; SSE2-SSSE3-NEXT: shlq $53, %r14
+; SSE2-SSSE3-NEXT: sarq $63, %r14
+; SSE2-SSSE3-NEXT: movd %r14d, %xmm7
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $54, %r15
+; SSE2-SSSE3-NEXT: sarq $63, %r15
+; SSE2-SSSE3-NEXT: movd %r15d, %xmm6
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: shlq $55, %r12
+; SSE2-SSSE3-NEXT: sarq $63, %r12
+; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; SSE2-SSSE3-NEXT: shlq $60, %r13
+; SSE2-SSSE3-NEXT: sarq $63, %r13
+; SSE2-SSSE3-NEXT: movd %r13d, %xmm8
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: shlq $61, %rbx
+; SSE2-SSSE3-NEXT: sarq $63, %rbx
+; SSE2-SSSE3-NEXT: movd %ebx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-SSSE3-NEXT: shlq $62, %rax
+; SSE2-SSSE3-NEXT: sarq $63, %rax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm7
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-SSSE3-NEXT: shlq $63, %rcx
+; SSE2-SSSE3-NEXT: sarq $63, %rcx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-SSSE3-NEXT: shlq $58, %rdx
+; SSE2-SSSE3-NEXT: sarq $63, %rdx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-SSSE3-NEXT: shlq $59, %rdi
+; SSE2-SSSE3-NEXT: sarq $63, %rdi
+; SSE2-SSSE3-NEXT: movd %edi, %xmm7
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-SSSE3-NEXT: shlq $57, %rbp
+; SSE2-SSSE3-NEXT: sarq $63, %rbp
+; SSE2-SSSE3-NEXT: movd %ebp, %xmm5
+; SSE2-SSSE3-NEXT: shrq $7, %rsi
+; SSE2-SSSE3-NEXT: movd %esi, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-SSSE3-NEXT: popq %rbx
+; SSE2-SSSE3-NEXT: popq %r12
+; SSE2-SSSE3-NEXT: popq %r13
+; SSE2-SSSE3-NEXT: popq %r14
+; SSE2-SSSE3-NEXT: popq %r15
+; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i64_64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi28:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi29:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi30:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi31:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi32:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi33:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi34:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi35:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $32, %rdi
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $45, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $44, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $43, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $42, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $41, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $40, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $39, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $38, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $37, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $36, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $35, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $34, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: shlq $33, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vmovd %r8d, %xmm1
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX1-NEXT: shlq $62, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $47, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $46, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: shlq $45, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: shlq $44, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: shlq $43, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: shlq $42, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: shlq $41, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: shlq $40, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rbx
+; AVX1-NEXT: shlq $39, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: shlq $38, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
+; AVX1-NEXT: movsbq %dl, %rax
+; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shlq $37, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: shlq $36, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: shlq $35, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: shlq $34, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r14
+; AVX1-NEXT: shlq $33, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %r15
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX1-NEXT: shrq $31, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shlq $63, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: shlq $62, %r13
+; AVX1-NEXT: sarq $63, %r13
+; AVX1-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
+; AVX1-NEXT: shlq $61, %r9
+; AVX1-NEXT: sarq $63, %r9
+; AVX1-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $60, %r12
+; AVX1-NEXT: sarq $63, %r12
+; AVX1-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $59, %rdi
+; AVX1-NEXT: sarq $63, %rdi
+; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $58, %rbx
+; AVX1-NEXT: sarq $63, %rbx
+; AVX1-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shlq $57, %r8
+; AVX1-NEXT: sarq $63, %r8
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX1-NEXT: shrq $7, %rdi
+; AVX1-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $55, %r10
+; AVX1-NEXT: sarq $63, %r10
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $54, %rsi
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; AVX1-NEXT: shlq $53, %r11
+; AVX1-NEXT: sarq $63, %r11
+; AVX1-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $52, %r14
+; AVX1-NEXT: sarq $63, %r14
+; AVX1-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $51, %r15
+; AVX1-NEXT: sarq $63, %r15
+; AVX1-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
+; AVX1-NEXT: shlq $50, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shlq $49, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrq $15, %rdx
+; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi28:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi29:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi30:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi31:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi32:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi33:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi34:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi35:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $32, %rdi
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $45, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $44, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $43, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $42, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $41, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $40, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $39, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $38, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $37, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $36, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $35, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $34, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: shlq $33, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vmovd %r8d, %xmm1
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT: shlq $62, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $47, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $46, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: shlq $45, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: shlq $44, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: shlq $43, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: shlq $42, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: shlq $41, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: shlq $40, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rbx
+; AVX2-NEXT: shlq $39, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shlq $38, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
+; AVX2-NEXT: movsbq %dl, %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shlq $37, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: shlq $36, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shlq $35, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: shlq $34, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shlq $33, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %r15
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: shrq $31, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shlq $63, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: shlq $62, %r13
+; AVX2-NEXT: sarq $63, %r13
+; AVX2-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
+; AVX2-NEXT: shlq $61, %r9
+; AVX2-NEXT: sarq $63, %r9
+; AVX2-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $60, %r12
+; AVX2-NEXT: sarq $63, %r12
+; AVX2-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $59, %rdi
+; AVX2-NEXT: sarq $63, %rdi
+; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $58, %rbx
+; AVX2-NEXT: sarq $63, %rbx
+; AVX2-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shlq $57, %r8
+; AVX2-NEXT: sarq $63, %r8
+; AVX2-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX2-NEXT: shrq $7, %rdi
+; AVX2-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $55, %r10
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $54, %rsi
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; AVX2-NEXT: shlq $53, %r11
+; AVX2-NEXT: sarq $63, %r11
+; AVX2-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $52, %r14
+; AVX2-NEXT: sarq $63, %r14
+; AVX2-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $51, %r15
+; AVX2-NEXT: sarq $63, %r15
+; AVX2-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
+; AVX2-NEXT: shlq $50, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shlq $49, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrq $15, %rdx
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i64_64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ %2 = sext <64 x i1> %1 to <64 x i8>
+ ret <64 x i8> %2
+}
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
new file mode 100644
index 000000000000..aa9e60df1404
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -0,0 +1,3279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+;
+; 128-bit vectors
+;
+
+define <2 x i64> @ext_i2_2i64(i2 %a0) {
+; SSE2-SSSE3-LABEL: ext_i2_2i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $3, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i2_2i64:
+; AVX12: # BB#0:
+; AVX12-NEXT: andb $3, %dil
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i2_2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $3, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ %2 = zext <2 x i1> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <4 x i32> @ext_i4_4i32(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = zext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @ext_i8_8i16(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i8_8i16:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $7, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k5
+; AVX512-NEXT: kshiftlw $8, %k5, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kshiftlw $9, %k5, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kshiftlw $10, %k5, %k2
+; AVX512-NEXT: kshiftrw $15, %k2, %k2
+; AVX512-NEXT: kshiftlw $11, %k5, %k3
+; AVX512-NEXT: kshiftrw $15, %k3, %k3
+; AVX512-NEXT: kshiftlw $12, %k5, %k4
+; AVX512-NEXT: kshiftrw $15, %k4, %k4
+; AVX512-NEXT: kshiftlw $13, %k5, %k6
+; AVX512-NEXT: kshiftrw $15, %k6, %k6
+; AVX512-NEXT: kshiftlw $15, %k5, %k7
+; AVX512-NEXT: kshiftrw $15, %k7, %k7
+; AVX512-NEXT: kshiftlw $14, %k5, %k5
+; AVX512-NEXT: kshiftrw $15, %k5, %k5
+; AVX512-NEXT: kmovd %k5, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovd %k7, %ecx
+; AVX512-NEXT: andl $1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k6, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k4, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k3, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k2, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @ext_i16_16i8(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: ext_i16_16i8:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $7, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $8, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $9, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $10, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $11, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $12, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $13, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $14, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $15, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Lcfi0:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Lcfi1:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Lcfi2:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: .Lcfi3:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: .Lcfi4:
+; AVX512-NEXT: .cfi_def_cfa_offset 48
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Lcfi5:
+; AVX512-NEXT: .cfi_def_cfa_offset 56
+; AVX512-NEXT: .Lcfi6:
+; AVX512-NEXT: .cfi_offset %rbx, -56
+; AVX512-NEXT: .Lcfi7:
+; AVX512-NEXT: .cfi_offset %r12, -48
+; AVX512-NEXT: .Lcfi8:
+; AVX512-NEXT: .cfi_offset %r13, -40
+; AVX512-NEXT: .Lcfi9:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Lcfi10:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Lcfi11:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r8d
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r9d
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r10d
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r11d
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r14d
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r15d
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r12d
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %r13d
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %esi
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ebx
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ebp
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %edi
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %eax
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %ecx
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %k1, %edx
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: vmovd %r9d, %xmm0
+; AVX512-NEXT: kmovd %k0, %r9d
+; AVX512-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x i64> @ext_i4_4i64(i4 %a0) {
+; SSE2-SSSE3-LABEL: ext_i4_4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: andb $15, %dil
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i4_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i4_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: andb $15, %dil
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ %2 = zext <4 x i1> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @ext_i8_8i32(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: shrl $4, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: shrl $4, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <16 x i16> @ext_i16_16i16(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: shrl $8, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $15, %ecx
+; AVX1-NEXT: movzwl %cx, %ecx
+; AVX1-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: shrl $8, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $15, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @ext_i32_32i8(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrl $15, %edi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrl $15, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = zext <32 x i1> %1 to <32 x i8>
+ ret <32 x i8> %2
+}
+
+;
+; 512-bit vectors
+;
+
+define <8 x i64> @ext_i8_8i64(i8 %a0) {
+; SSE2-SSSE3-LABEL: ext_i8_8i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i8_8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i8_8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ %2 = zext <8 x i1> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <16 x i32> @ext_i16_16i32(i16 %a0) {
+; SSE2-SSSE3-LABEL: ext_i16_16i32:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $3, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $5, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $6, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $7, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $9, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $10, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $11, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $12, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $13, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $14, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i16_16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $4, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $5, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $6, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $7, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $9, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $10, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $11, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $12, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $13, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $14, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i16_16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ %2 = zext <16 x i1> %1 to <16 x i32>
+ ret <16 x i32> %2
+}
+
+define <32 x i16> @ext_i32_32i16(i32 %a0) {
+; SSE2-SSSE3-LABEL: ext_i32_32i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %edi, %eax
+; SSE2-SSSE3-NEXT: shrl $16, %eax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i32_32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $128, %rsp
+; AVX1-NEXT: .Lcfi6:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Lcfi7:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Lcfi8:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Lcfi9:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Lcfi10:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: movl %edi, %r12d
+; AVX1-NEXT: movl %edi, %r15d
+; AVX1-NEXT: movl %edi, %r14d
+; AVX1-NEXT: movl %edi, %ebx
+; AVX1-NEXT: movl %edi, %r11d
+; AVX1-NEXT: movl %edi, %r10d
+; AVX1-NEXT: movl %edi, %r9d
+; AVX1-NEXT: movl %edi, %r8d
+; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movl %edi, %edx
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %edx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $4, %esi
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX1-NEXT: shrl $5, %r8d
+; AVX1-NEXT: andl $1, %r8d
+; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $6, %r9d
+; AVX1-NEXT: andl $1, %r9d
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $7, %r10d
+; AVX1-NEXT: andl $1, %r10d
+; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $8, %r11d
+; AVX1-NEXT: andl $1, %r11d
+; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $9, %ebx
+; AVX1-NEXT: andl $1, %ebx
+; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $10, %r14d
+; AVX1-NEXT: andl $1, %r14d
+; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $11, %r15d
+; AVX1-NEXT: andl $1, %r15d
+; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $12, %r12d
+; AVX1-NEXT: andl $1, %r12d
+; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX1-NEXT: shrl $13, %r13d
+; AVX1-NEXT: andl $1, %r13d
+; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: leaq -40(%rbp), %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i32_32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi4:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi5:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: .Lcfi6:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Lcfi7:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Lcfi8:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Lcfi9:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Lcfi10:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX2-NEXT: movl %edi, %r13d
+; AVX2-NEXT: movl %edi, %r12d
+; AVX2-NEXT: movl %edi, %r15d
+; AVX2-NEXT: movl %edi, %r14d
+; AVX2-NEXT: movl %edi, %ebx
+; AVX2-NEXT: movl %edi, %r11d
+; AVX2-NEXT: movl %edi, %r10d
+; AVX2-NEXT: movl %edi, %r9d
+; AVX2-NEXT: movl %edi, %r8d
+; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movl %edi, %edx
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $4, %esi
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; AVX2-NEXT: shrl $5, %r8d
+; AVX2-NEXT: andl $1, %r8d
+; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $6, %r9d
+; AVX2-NEXT: andl $1, %r9d
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $7, %r10d
+; AVX2-NEXT: andl $1, %r10d
+; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $8, %r11d
+; AVX2-NEXT: andl $1, %r11d
+; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $9, %ebx
+; AVX2-NEXT: andl $1, %ebx
+; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $10, %r14d
+; AVX2-NEXT: andl $1, %r14d
+; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $11, %r15d
+; AVX2-NEXT: andl $1, %r15d
+; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $12, %r12d
+; AVX2-NEXT: andl $1, %r12d
+; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
+; AVX2-NEXT: shrl $13, %r13d
+; AVX2-NEXT: andl $1, %r13d
+; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: leaq -40(%rbp), %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i32_32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ %2 = zext <32 x i1> %1 to <32 x i16>
+ ret <32 x i16> %2
+}
+
+define <64 x i8> @ext_i64_64i8(i64 %a0) {
+; SSE2-SSSE3-LABEL: ext_i64_64i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $32, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: shrq $48, %rax
+; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shrl $16, %edi
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm7
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i64_64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi11:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi12:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi13:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $15, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $49, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movq %rdi, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $50, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $51, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $52, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $53, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $54, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $55, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $57, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $58, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $59, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $60, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $61, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $62, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $33, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movq %rdi, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $34, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $35, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $36, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $37, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $38, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $39, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $40, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $41, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $42, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $43, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $44, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $45, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $46, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX1-NEXT: shrq $47, %rdi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi11:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi12:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi13:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $15, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $49, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movq %rdi, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $50, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $51, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $52, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $53, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $54, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $55, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $56, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $57, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $58, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $59, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $60, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $61, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $62, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $33, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movq %rdi, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $34, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $35, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $36, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $37, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $38, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $39, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $40, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $41, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $42, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $43, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $44, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $45, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $46, %rax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX2-NEXT: shrq $47, %rdi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ext_i64_64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k1
+; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ %2 = zext <64 x i1> %1 to <64 x i8>
+ ret <64 x i8> %2
+}
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
new file mode 100644
index 000000000000..a190e0575522
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -0,0 +1,685 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i2_2i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movq %rax, %xmm1
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i2_2i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl %eax
+; AVX12-NEXT: andl $1, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i2_2i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i2 %a0 to <2 x i1>
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i4_4i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: shrl %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm2
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: bitcast_i4_4i1:
+; AVX1: # BB#0:
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl %ecx
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $2, %ecx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i4_4i1:
+; AVX2: # BB#0:
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl %ecx
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $2, %ecx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i4_4i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = bitcast i4 %a0 to <4 x i1>
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i8_8i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: shrl $7, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm3
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i8_8i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $7, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i8_8i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2w %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i8 %a0 to <8 x i1>
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i16_16i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $7, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $6, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $5, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $4, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $3, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $2, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $11, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $10, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $9, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $8, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $13, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $12, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-SSSE3-NEXT: movl %eax, %ecx
+; SSE2-SSSE3-NEXT: shrl $14, %ecx
+; SSE2-SSSE3-NEXT: andl $1, %ecx
+; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
+; SSE2-SSSE3-NEXT: shrl $15, %eax
+; SSE2-SSSE3-NEXT: movzwl %ax, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm4
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i16_16i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: movl %eax, %edx
+; AVX12-NEXT: andl $1, %edx
+; AVX12-NEXT: vmovd %edx, %xmm0
+; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $2, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $3, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $4, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $5, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $6, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $7, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $8, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $9, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $10, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $11, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $12, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $13, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: movl %eax, %ecx
+; AVX12-NEXT: shrl $14, %ecx
+; AVX12-NEXT: andl $1, %ecx
+; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX12-NEXT: shrl $15, %eax
+; AVX12-NEXT: movzwl %ax, %eax
+; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i16_16i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %xmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i16 %a0 to <16 x i1>
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i32_32i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movl %esi, (%rdi)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: bitcast_i32_32i1:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $17, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $18, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $19, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $20, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $21, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $22, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $23, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $24, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $25, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $26, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $27, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $28, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $29, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $30, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $31, %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movl %edi, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $2, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $3, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $10, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $11, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $12, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $13, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: shrl $14, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrl $15, %edi
+; AVX1-NEXT: andl $1, %edi
+; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i32_32i1:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $17, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $18, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $19, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $20, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $21, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $22, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $23, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $25, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $26, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $27, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $28, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $29, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $30, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $31, %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movl %edi, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $2, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $4, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $5, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $6, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $7, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $9, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $10, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $11, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $12, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $13, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: shrl $14, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrl $15, %edi
+; AVX2-NEXT: andl $1, %edi
+; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i32_32i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast i32 %a0 to <32 x i1>
+ ret <32 x i1> %1
+}
+
+define <64 x i1> @bitcast_i64_64i1(i64 %a0) {
+; SSE2-SSSE3-LABEL: bitcast_i64_64i1:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movq %rsi, (%rdi)
+; SSE2-SSSE3-NEXT: movq %rdi, %rax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: bitcast_i64_64i1:
+; AVX12: # BB#0:
+; AVX12-NEXT: movq %rsi, (%rdi)
+; AVX12-NEXT: movq %rdi, %rax
+; AVX12-NEXT: retq
+;
+; AVX512-LABEL: bitcast_i64_64i1:
+; AVX512: # BB#0:
+; AVX512-NEXT: kmovq %rdi, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast i64 %a0 to <64 x i1>
+ ret <64 x i1> %1
+}
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 9bf7b41a4f26..5616276da08d 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -1,41 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: v8i16:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i16:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <8 x i16> %a, %b
%res = bitcast <8 x i1> %x to i8
@@ -44,21 +44,21 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -71,21 +71,21 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -98,24 +98,24 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <16 x i8> %a, %b
%res = bitcast <16 x i1> %x to i16
@@ -124,7 +124,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -151,11 +151,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -168,11 +168,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -185,11 +185,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
@@ -206,7 +206,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -233,11 +233,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -250,11 +250,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -267,11 +267,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
@@ -288,7 +288,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm0
@@ -311,11 +311,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: ## BB#0:
+; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -326,11 +326,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -341,11 +341,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
@@ -362,7 +362,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
@@ -375,18 +375,18 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -399,21 +399,21 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -426,29 +426,29 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm1
; SSE2-SSSE3-NEXT: psrad $24, %xmm1
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $24, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
@@ -465,29 +465,29 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3: # BB#0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm1
; SSE2-SSSE3-NEXT: psrad $16, %xmm1
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpslld $16, %xmm1, %xmm1
; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
@@ -504,7 +504,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSE2-LABEL: v8i8:
-; SSE2: ## BB#0:
+; SSE2: # BB#0:
; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: psllw $8, %xmm0
@@ -513,11 +513,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i8:
-; SSSE3: ## BB#0:
+; SSSE3: # BB#0:
; SSSE3-NEXT: psllw $8, %xmm1
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: psllw $8, %xmm0
@@ -525,11 +525,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: ## BB#0:
+; AVX12: # BB#0:
; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
@@ -537,18 +537,18 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: retq
%x = icmp sgt <8 x i8> %a, %b
%res = bitcast <8 x i1> %x to i8
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index b2c619c48d4d..86475c42e79e 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -1,23 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: v16i16:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v16i16:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v16i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = icmp sgt <16 x i16> %a, %b
@@ -26,19 +50,53 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
}
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v8i32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = icmp sgt <8 x i32> %a, %b
@@ -47,19 +105,51 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
}
define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-NEXT: packsswb %xmm3, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: cmpltps %xmm1, %xmm3
+; SSSE3-NEXT: cmpltps %xmm0, %xmm2
+; SSSE3-NEXT: packsswb %xmm3, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v8f32:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8f32:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = fcmp ogt <8 x float> %a, %b
@@ -68,15 +158,241 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
}
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
+; SSE2-SSSE3-LABEL: v32i8:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-SSSE3-NEXT: shll $16, %ecx
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: orl %ecx, %eax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v32i8:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v32i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: vzeroupper
@@ -87,16 +403,56 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
}
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: v4i64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4i64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4i64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
@@ -109,16 +465,35 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
}
define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE2-SSSE3-LABEL: v4f64:
+; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4f64:
-; AVX2: ## BB#0:
+; AVX2: # BB#0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: v4f64:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
new file mode 100644
index 000000000000..4a5ef99a8653
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -0,0 +1,1377 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
+; SSE-LABEL: v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE-NEXT: pextrb $14, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE-NEXT: pextrb $14, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE-NEXT: pextrb $14, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE-NEXT: pextrb $14, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: movl (%rsp), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi1:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi2:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <32 x i16> %a, %b
+ %res = bitcast <32 x i1> %x to i32
+ ret i32 %res
+}
+
+define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
+; SSE-LABEL: v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <16 x i32> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
+; SSE-LABEL: v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packsswb %xmm6, %xmm4
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = fcmp ogt <16 x float> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
+; SSE-LABEL: v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE-NEXT: pextrb $15, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm1, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE-NEXT: pextrb $15, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm0, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE-NEXT: pextrb $15, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm3, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE-NEXT: pextrb $15, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $14, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $13, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $12, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $11, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $10, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $9, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $8, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $7, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $6, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $5, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $4, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $3, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $2, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $1, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: pextrb $0, %xmm2, %eax
+; SSE-NEXT: andb $1, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: shll $16, %edx
+; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Lcfi3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: .Lcfi4:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: .Lcfi5:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl (%rsp), %ecx
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl (%rsp), %ecx
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .Lcfi3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .Lcfi4:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .Lcfi5:
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl (%rsp), %ecx
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: shlq $32, %rax
+; AVX512F-NEXT: orq %rcx, %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <64 x i8> %a, %b
+ %res = bitcast <64 x i1> %x to i64
+ ret i64 %res
+}
+
+define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) {
+; SSE-LABEL: v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = icmp sgt <8 x i64> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
+
+define i8 @v8f64(<8 x double> %a, <8 x double> %b) {
+; SSE-LABEL: v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltpd %xmm3, %xmm7
+; SSE-NEXT: cmpltpd %xmm2, %xmm6
+; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: cmpltpd %xmm1, %xmm5
+; SSE-NEXT: cmpltpd %xmm0, %xmm4
+; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packsswb %xmm6, %xmm4
+; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v8f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %x = fcmp ogt <8 x double> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index c7de65d84507..b3f6534d14b3 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -354,6 +354,7 @@ define void @unnatural_cfg2() {
; single-source GCC.
; CHECK-LABEL: unnatural_cfg2
; CHECK: %entry
+; CHECK: %loop.header
; CHECK: %loop.body1
; CHECK: %loop.body2
; CHECK: %loop.body4
@@ -361,7 +362,6 @@ define void @unnatural_cfg2() {
; CHECK: %loop.inner2.begin
; CHECK: %loop.body3
; CHECK: %loop.inner1.begin
-; CHECK: %loop.header
; CHECK: %bail
entry:
@@ -1491,6 +1491,102 @@ ret: ; preds = %endif, %then
ret void
}
+define i32 @not_rotate_if_extra_branch(i32 %count) {
+; Test checks that there is no loop rotation
+; if it introduces an extra branch.
+; Specifically, in this case the best exit is .header,
+; but it has a fallthrough to the .middle block, and the last block in
+; the loop chain, .slow, does not have a fallthrough to .header.
+; CHECK-LABEL: not_rotate_if_extra_branch
+; CHECK: %.entry
+; CHECK: %.header
+; CHECK: %.middle
+; CHECK: %.backedge
+; CHECK: %.slow
+; CHECK: %.bailout
+; CHECK: %.stop
+.entry:
+ %sum.0 = shl nsw i32 %count, 1
+ br label %.header
+
+.header:
+ %i = phi i32 [ %i.1, %.backedge ], [ 0, %.entry ]
+ %sum = phi i32 [ %sum.1, %.backedge ], [ %sum.0, %.entry ]
+ %is_exc = icmp sgt i32 %i, 9000000
+ br i1 %is_exc, label %.bailout, label %.middle, !prof !13
+
+.bailout:
+ %sum.2 = add nsw i32 %count, 1
+ br label %.stop
+
+.middle:
+ %pr.1 = and i32 %i, 1023
+ %pr.2 = icmp eq i32 %pr.1, 0
+ br i1 %pr.2, label %.slow, label %.backedge, !prof !14
+
+.slow:
+ tail call void @effect(i32 %sum)
+ br label %.backedge
+
+.backedge:
+ %sum.1 = add nsw i32 %i, %sum
+ %i.1 = add nsw i32 %i, 1
+ %end = icmp slt i32 %i.1, %count
+ br i1 %end, label %.header, label %.stop, !prof !15
+
+.stop:
+ %sum.phi = phi i32 [ %sum.1, %.backedge ], [ %sum.2, %.bailout ]
+ ret i32 %sum.phi
+}
+
+define i32 @not_rotate_if_extra_branch_regression(i32 %count, i32 %init) {
+; This is a regression test for the patch that avoids loop rotation
+; if it introduces an extra branch.
+; CHECK-LABEL: not_rotate_if_extra_branch_regression
+; CHECK: %.entry
+; CHECK: %.first_backedge
+; CHECK: %.slow
+; CHECK: %.second_header
+.entry:
+ %sum.0 = shl nsw i32 %count, 1
+ br label %.first_header
+
+.first_header:
+ %i = phi i32 [ %i.1, %.first_backedge ], [ 0, %.entry ]
+ %is_bo1 = icmp sgt i32 %i, 9000000
+ br i1 %is_bo1, label %.bailout, label %.first_backedge, !prof !14
+
+.first_backedge:
+ %i.1 = add nsw i32 %i, 1
+ %end = icmp slt i32 %i.1, %count
+ br i1 %end, label %.first_header, label %.second_header, !prof !13
+
+.second_header:
+ %j = phi i32 [ %j.1, %.second_backedge ], [ %init, %.first_backedge ]
+ %end.2 = icmp sgt i32 %j, %count
+ br i1 %end.2, label %.stop, label %.second_middle, !prof !14
+
+.second_middle:
+ %is_slow = icmp sgt i32 %j, 9000000
+ br i1 %is_slow, label %.slow, label %.second_backedge, !prof !14
+
+.slow:
+ tail call void @effect(i32 %j)
+ br label %.second_backedge
+
+.second_backedge:
+ %j.1 = add nsw i32 %j, 1
+ %end.3 = icmp slt i32 %j, 10000000
+ br i1 %end.3, label %.second_header, label %.stop, !prof !13
+
+.stop:
+ %res = add nsw i32 %j, %i.1
+ ret i32 %res
+
+.bailout:
+ ret i32 0
+}
+
declare void @effect(i32)
!5 = !{!"branch_weights", i32 84, i32 16}
@@ -1501,3 +1597,6 @@ declare void @effect(i32)
!10 = !{!"branch_weights", i32 90, i32 10}
!11 = !{!"branch_weights", i32 1, i32 1}
!12 = !{!"branch_weights", i32 5, i32 3}
+!13 = !{!"branch_weights", i32 1, i32 1}
+!14 = !{!"branch_weights", i32 1, i32 1023}
+!15 = !{!"branch_weights", i32 4095, i32 1}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index a0a1c3646624..7f7f9791d903 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,45 +1,62 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: cmovnel %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%t2 = icmp ne i32 %t1, 0
%t3 = select i1 %t2, i32 %a, i32 %b
ret i32 %t3
-; CHECK: foo
-; CHECK: ptest
-; CHECK-NOT: testl
-; CHECK: cmov
-; CHECK: ret
}
define i32 @bar(<2 x i64> %c) {
+; CHECK-LABEL: bar:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: jne .LBB1_2
+; CHECK-NEXT: # BB#1: # %if-true-block
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_2: # %endif-block
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retq
entry:
%0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%1 = icmp ne i32 %0, 0
br i1 %1, label %if-true-block, label %endif-block
-if-true-block: ; preds = %entry
+if-true-block:
ret i32 0
-endif-block: ; preds = %entry,
+endif-block:
ret i32 1
-; CHECK: bar
-; CHECK: ptest
-; CHECK-NOT: testl
-; CHECK: jne
-; CHECK: ret
}
define i32 @bax(<2 x i64> %c) {
+; CHECK-LABEL: bax:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ptest %xmm0, %xmm0
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
%t2 = icmp eq i32 %t1, 1
%t3 = zext i1 %t2 to i32
ret i32 %t3
-; CHECK: bax
-; CHECK: ptest
-; CHECK-NOT: cmpl
-; CHECK: ret
}
-define i16 @rnd16(i16 %arg) nounwind uwtable {
+define i16 @rnd16(i16 %arg) nounwind {
+; CHECK-LABEL: rnd16:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandw %cx
+; CHECK-NEXT: cmovbw %di, %ax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdrand.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
%3 = extractvalue { i16, i32 } %1, 1
@@ -47,14 +64,16 @@ define i16 @rnd16(i16 %arg) nounwind uwtable {
%5 = select i1 %4, i16 0, i16 %arg
%6 = add i16 %5, %2
ret i16 %6
-; CHECK: rnd16
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i32 @rnd32(i32 %arg) nounwind uwtable {
+define i32 @rnd32(i32 %arg) nounwind {
+; CHECK-LABEL: rnd32:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandl %ecx
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%2 = extractvalue { i32, i32 } %1, 0
%3 = extractvalue { i32, i32 } %1, 1
@@ -62,14 +81,16 @@ define i32 @rnd32(i32 %arg) nounwind uwtable {
%5 = select i1 %4, i32 0, i32 %arg
%6 = add i32 %5, %2
ret i32 %6
-; CHECK: rnd32
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i64 @rnd64(i64 %arg) nounwind uwtable {
+define i64 @rnd64(i64 %arg) nounwind {
+; CHECK-LABEL: rnd64:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdrandq %rcx
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
%1 = tail call { i64, i32 } @llvm.x86.rdrand.64() nounwind
%2 = extractvalue { i64, i32 } %1, 0
%3 = extractvalue { i64, i32 } %1, 1
@@ -77,14 +98,17 @@ define i64 @rnd64(i64 %arg) nounwind uwtable {
%5 = select i1 %4, i64 0, i64 %arg
%6 = add i64 %5, %2
ret i64 %6
-; CHECK: rnd64
-; CHECK: rdrand
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i16 @seed16(i16 %arg) nounwind uwtable {
+define i16 @seed16(i16 %arg) nounwind {
+; CHECK-LABEL: seed16:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedw %cx
+; CHECK-NEXT: cmovbw %di, %ax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdseed.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
%3 = extractvalue { i16, i32 } %1, 1
@@ -92,14 +116,16 @@ define i16 @seed16(i16 %arg) nounwind uwtable {
%5 = select i1 %4, i16 0, i16 %arg
%6 = add i16 %5, %2
ret i16 %6
-; CHECK: seed16
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i32 @seed32(i32 %arg) nounwind uwtable {
+define i32 @seed32(i32 %arg) nounwind {
+; CHECK-LABEL: seed32:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedl %ecx
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call { i32, i32 } @llvm.x86.rdseed.32() nounwind
%2 = extractvalue { i32, i32 } %1, 0
%3 = extractvalue { i32, i32 } %1, 1
@@ -107,14 +133,16 @@ define i32 @seed32(i32 %arg) nounwind uwtable {
%5 = select i1 %4, i32 0, i32 %arg
%6 = add i32 %5, %2
ret i32 %6
-; CHECK: seed32
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
-define i64 @seed64(i64 %arg) nounwind uwtable {
+define i64 @seed64(i64 %arg) nounwind {
+; CHECK-LABEL: seed64:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: rdseedq %rcx
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: retq
%1 = tail call { i64, i32 } @llvm.x86.rdseed.64() nounwind
%2 = extractvalue { i64, i32 } %1, 0
%3 = extractvalue { i64, i32 } %1, 1
@@ -122,11 +150,6 @@ define i64 @seed64(i64 %arg) nounwind uwtable {
%5 = select i1 %4, i64 0, i64 %arg
%6 = add i64 %5, %2
ret i64 %6
-; CHECK: seed64
-; CHECK: rdseed
-; CHECK: cmov
-; CHECK-NOT: cmov
-; CHECK: ret
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index bbe31c5c2ac5..14bdb3853b03 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1,13 +1,12 @@
-; NOTE: Assertions have been simpilfied MANUALLY after running utils/update_llc_test_checks.py
-; Assertions for constant pools have been added MANUALLY.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64
;===-----------------------------------------------------------------------------===
; This test checks the ability to recognize a cross element pattern of
@@ -17,20 +16,31 @@
; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===
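; A minimal sketch of the pattern described above (the function @illustrative_splat
; is hypothetical and not part of the committed test file): a <16 x i8> constant that
; repeats the bytes <0,1> has the same bit pattern as the i16 value 256 (0x0100)
; splat across eight lanes, so the backend can materialize it with a single
; vpbroadcastw of 256 instead of loading a full 16-byte constant, which is what
; the autogenerated CHECK lines in the tests below expect.
define <16 x i8> @illustrative_splat(<16 x i8> %a) {
  %r = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  ret <16 x i8> %r
}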
-; ALL: LCPI0
-; ALL-NEXT: .short 256 # 0x100
-
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i16:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i16:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
@@ -40,45 +50,48 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
}
-; ALL: .LCPI1
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI1
-; AVX-NEXT .long 50462976 # float 3.82047143E-37
-
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f16xi8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
%res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
ret <16 x i8> %res2
}
-; ALL64: .LCPI2
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; AVX: .LCPI2
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
+; AVX-LABEL: f16xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f16xi8_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -86,38 +99,56 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f16xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f16xi8_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f16xi8_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
%res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
ret <16 x i8> %res2
}
-; ALL: .LCPI3
-; ALL-NEXT: .short 256 # 0x100
-
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i16:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i16:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
@@ -127,155 +158,273 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
}
-; ALL: .LCPI4
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI4
-; AVX-NEXT: .long 50462976 # float 3.82047143E-37
-
define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f32xi8_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
ret <32 x i8> %res2
}
-; ALL64: .LCPI5
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; AVX: .LCPI5
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
+; AVX-LABEL: f32xi8_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f32xi8_i64:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f32xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f32xi8_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f32xi8_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
ret <32 x i8> %res2
}
-; ALL: .LCPI6
-; ALL-NEXT: .byte 0 # 0x0
-; ALL-NEXT: .byte 1 # 0x1
-; ALL-NEXT: .byte 2 # 0x2
-; ALL-NEXT: .byte 3 # 0x3
-; ALL-NEXT: .byte 4 # 0x4
-; ALL-NEXT: .byte 5 # 0x5
-; ALL-NEXT: .byte 6 # 0x6
-; ALL-NEXT: .byte 7 # 0x7
-; ALL-NEXT: .byte 8 # 0x8
-; ALL-NEXT: .byte 9 # 0x9
-; ALL-NEXT: .byte 10 # 0xa
-; ALL-NEXT: .byte 11 # 0xb
-; ALL-NEXT: .byte 12 # 0xc
-; ALL-NEXT: .byte 13 # 0xd
-; ALL-NEXT: .byte 14 # 0xe
-; ALL-NEXT: .byte 15 # 0xf
-; ALL-NOT: .byte
-
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
-; ALL-LABEL: f32xi8_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f32xi8_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f32xi8_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f32xi8_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f32xi8_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
%res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
ret <32 x i8> %res2
}
-; ALL: .LCPI7
-; ALL-NEXT: .short 256 # 0x100
-
define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i16:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i16:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i16:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i16:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i16:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
ret <64 x i8> %res2
}
-; ALL: .LCPI8
-; ALL-NEXT: .long 50462976 # 0x3020100
-
-; AVX: .LCPI8
-; AVX-NEXT: .long 50462976 # float 3.82047143E-37
-
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
+; AVX-LABEL: f64i8_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64i8_i32:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f64i8_i32:
+; AVX-64-LABEL: f64i8_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64i8_i32:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64i8_i32:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
+ ret <64 x i8> %res2
+}
+
+
+define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -283,43 +432,69 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
- %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
- ret <64 x i8> %res2
-}
-
-
-; ALL64: .LCPI9
-; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100
-
-; ALL32: .LCPI9
-; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
-; AVX: .LCPI9
-; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275
-
-define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i64:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f64xi8_i64:
+; AVX-64-LABEL: f64xi8_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i64:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i64:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
+ %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
+ ret <64 x i8> %res2
+}
+
+
+define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -327,143 +502,184 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
- %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
- ret <64 x i8> %res2
-}
-
-
-; ALL: .LCPI10
-; ALL-NEXT: .byte 0 # 0x0
-; ALL-NEXT: .byte 1 # 0x1
-; ALL-NEXT: .byte 2 # 0x2
-; ALL-NEXT: .byte 3 # 0x3
-; ALL-NEXT: .byte 4 # 0x4
-; ALL-NEXT: .byte 5 # 0x5
-; ALL-NEXT: .byte 6 # 0x6
-; ALL-NEXT: .byte 7 # 0x7
-; ALL-NEXT: .byte 8 # 0x8
-; ALL-NEXT: .byte 9 # 0x9
-; ALL-NEXT: .byte 10 # 0xa
-; ALL-NEXT: .byte 11 # 0xb
-; ALL-NEXT: .byte 12 # 0xc
-; ALL-NEXT: .byte 13 # 0xd
-; ALL-NEXT: .byte 14 # 0xe
-; ALL-NEXT: .byte 15 # 0xf
-; ALL-NOT: .byte
-
-define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f64xi8_i128:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i128:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i128:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
ret <64 x i8> %res2
}
-; AVX512BW: .LCPI11
-; AVX512BW-NEXT: .byte 0 # 0x0
-; AVX512BW-NEXT: .byte 1 # 0x1
-; AVX512BW-NEXT: .byte 2 # 0x2
-; AVX512BW-NEXT: .byte 3 # 0x3
-; AVX512BW-NEXT: .byte 4 # 0x4
-; AVX512BW-NEXT: .byte 5 # 0x5
-; AVX512BW-NEXT: .byte 6 # 0x6
-; AVX512BW-NEXT: .byte 7 # 0x7
-; AVX512BW-NEXT: .byte 8 # 0x8
-; AVX512BW-NEXT: .byte 9 # 0x9
-; AVX512BW-NEXT: .byte 10 # 0xa
-; AVX512BW-NEXT: .byte 11 # 0xb
-; AVX512BW-NEXT: .byte 12 # 0xc
-; AVX512BW-NEXT: .byte 13 # 0xd
-; AVX512BW-NEXT: .byte 14 # 0xe
-; AVX512BW-NEXT: .byte 15 # 0xf
-; AVX512BW-NEXT: .byte 16 # 0x10
-; AVX512BW-NEXT: .byte 17 # 0x11
-; AVX512BW-NEXT: .byte 18 # 0x12
-; AVX512BW-NEXT: .byte 19 # 0x13
-; AVX512BW-NEXT: .byte 20 # 0x14
-; AVX512BW-NEXT: .byte 21 # 0x15
-; AVX512BW-NEXT: .byte 22 # 0x16
-; AVX512BW-NEXT: .byte 23 # 0x17
-; AVX512BW-NEXT: .byte 24 # 0x18
-; AVX512BW-NEXT: .byte 25 # 0x19
-; AVX512BW-NEXT: .byte 26 # 0x1a
-; AVX512BW-NEXT: .byte 27 # 0x1b
-; AVX512BW-NEXT: .byte 28 # 0x1c
-; AVX512BW-NEXT: .byte 29 # 0x1d
-; AVX512BW-NEXT: .byte 30 # 0x1e
-; AVX512BW-NEXT: .byte 31 # 0x1f
-; AVX512BW-NOT: .byte
-
define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
+; AVX-LABEL: f64xi8_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; NO-AVX512BW-LABEL: f64xi8_i256:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
+;
; AVX512BW-LABEL: f64xi8_i256:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f64xi8_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
%res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
ret <64 x i8> %res2
}
-; ALL: .LCPI12
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI12
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i32:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i32:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
%res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
ret <8 x i16> %res2
}
-; ALL64: .LCPI13
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI13
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI13
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -471,67 +687,66 @@ define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
%res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
ret <8 x i16> %res2
}
-; ALL: .LCPI14
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI14
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f16xi16_i32:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
ret <16 x i16> %res2
}
-; ALL64: .LCPI15
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI15
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI15
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f16xi16_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -540,60 +755,154 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
ret <16 x i16> %res2
}
-; ALL: .LCPI16
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f16xi16_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
%res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
ret <16 x i16> %res2
}
-; ALL: .LCPI17
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI17
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i32:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f32xi16_i32:
+; AVX-64-LABEL: f32xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i32:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i32:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <32 x i16> %res2
+}
+
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -601,43 +910,69 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL64: .LCPI18
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI18
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI18
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i64:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
;
-; AVX-LABEL: f32xi16_i64:
+; AVX-64-LABEL: f32xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i64:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i64:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <32 x i16> %res2
+}
+
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -645,87 +980,151 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL: .LCPI19
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
-define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
; NO-AVX512BW-LABEL: f32xi16_i128:
; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i128:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i128:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
%res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
ret <32 x i16> %res2
}
-; AVX512BW: .LCPI20
-; AVX512BW-NEXT: .short 0 # 0x0
-; AVX512BW-NEXT: .short 1 # 0x1
-; AVX512BW-NEXT: .short 2 # 0x2
-; AVX512BW-NEXT: .short 3 # 0x3
-; AVX512BW-NEXT: .short 4 # 0x4
-; AVX512BW-NEXT: .short 5 # 0x5
-; AVX512BW-NEXT: .short 6 # 0x6
-; AVX512BW-NEXT: .short 7 # 0x7
-; AVX512BW-NEXT: .short 8 # 0x8
-; AVX512BW-NEXT: .short 9 # 0x9
-; AVX512BW-NEXT: .short 10 # 0xa
-; AVX512BW-NEXT: .short 11 # 0xb
-; AVX512BW-NEXT: .short 12 # 0xc
-; AVX512BW-NEXT: .short 13 # 0xd
-; AVX512BW-NEXT: .short 14 # 0xe
-; AVX512BW-NEXT: .short 15 # 0xf
-; AVX512BW-NOT: .short
-
define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; NO-AVX512BW-LABEL: f32xi16_i256:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
+;
; AVX512BW-LABEL: f32xi16_i256:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
%res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
%res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
ret <32 x i16> %res2
}
-; ALL64: .LCPI21
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI21
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI21
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; AVX-LABEL: f4xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f4xi32_i64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -733,40 +1132,26 @@ define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f4xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xi32_i64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xi32_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
%res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
ret <4 x i32> %res2
}
-; ALL64: .LCPI22
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI22
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI22
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
; AVX-LABEL: f8xi32_i64:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -775,59 +1160,154 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
%res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
ret <8 x i32> %res2
}
-; ALL: .LCPI23
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f8xi32_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
%res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
%res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
ret <8 x i32> %res2
}
-; ALL64: .LCPI24
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI24
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI24
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xi32_i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i64:
; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xi32_i64:
+; AVX-64-LABEL: f16xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
+ %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <16 x i32> %res2
+}
+
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i128:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -835,51 +1315,103 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
- %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
- ret <16 x i32> %res2
-}
-
-
-; ALL: .LCPI25
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
-define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xi32_i128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f16xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
%res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
ret <16 x i32> %res2
}
-; ALL64: .LCPI26
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; AVX-LABEL: f4xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f4xi64_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f4xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm2
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xi64_i128:
; ALL64: # BB#0:
-; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT: retq
@@ -889,15 +1421,62 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
}
-; ALL64: .LCPI27
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm3
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
; AVX2-64-LABEL: f8xi64_i128:
; AVX2-64: # BB#0:
-; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -906,57 +1485,99 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
;
; AVX512F-64-LABEL: f8xi64_i128:
; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i128:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
%res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
%res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
ret <8 x i64> %res2
}
-; ALL64: .LCPI28
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NEXT: .quad 2 # 0x2
-; ALL64-NEXT: .quad 3 # 0x3
-; ALL64-NOT: .quad
-
define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm4
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xi64_i256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
; AVX512F-64-LABEL: f8xi64_i256:
; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i256:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
%res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
%res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
ret <8 x i64> %res2
}
-; ALL: .LCPI29
-; ALL-NEXT: .quad 4575657222482165760
-
-; AVX: .LCPI29
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; AVX-LABEL: f4xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f4xf32_f64:
; ALL32: # BB#0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -964,221 +1585,367 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f4xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f4xf32_f64:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
%res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <4 x float> %res2
}
-; ALL64: .LCPI30
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI30
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI30
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <8 x float> @f8xf32_f64(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f64:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f8xf32_f64:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <8 x float> %res2
}
-; ALL: .LCPI31
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
define <8 x float> @f8xf32_f128(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f8xf32_f128:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
%res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
ret <8 x float> %res2
}
-; ALL64: .LCPI32
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI32
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI32
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xf32_f64:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f64:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1
+; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
%res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
ret <16 x float> %res2
}
-; ALL: .LCPI33
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f16xf32_f128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f16xf32_f128:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
%res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
ret <16 x float> %res2
}
-; AVX512: .LCPI34
-; AVX512-NEXT: .long 1090519040 # float 8
-; AVX512-NEXT: .long 1065353216 # float 1
-; AVX512-NEXT: .long 1073741824 # float 2
-; AVX512-NEXT: .long 1077936128 # float 3
-; AVX512-NEXT: .long 1082130432 # float 4
-; AVX512-NEXT: .long 1084227584 # float 5
-; AVX512-NEXT: .long 1086324736 # float 6
-; AVX512-NEXT: .long 1088421888 # float 7
-; AVX512-NOT: .long
-
define <16 x float> @f16xf32_f256(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f16xf32_f256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
+;
; AVX512-LABEL: f16xf32_f256:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f16xf32_f256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f256:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
%res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
ret <16 x float> %res2
}
-; ALL: .LCPI35
-; ALL-NEXT: .quad 4611686018427387904 # double 2
-; ALL-NEXT: .quad 4607182418800017408 # double 1
-; ALL-NOT: .quad
-
define <4 x double> @f4xf64_f128(<4 x double> %a) {
-; ALL-LABEL: f4xf64_f128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0
-;
; AVX-LABEL: f4xf64_f128:
; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f4xf64_f128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f4xf64_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f4xf64_f128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
%res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
%res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
ret <4 x double> %res2
}
-; ALL: .LCPI36
-; ALL-NEXT: .quad 4611686018427387904 # double 2
-; ALL-NEXT: .quad 4607182418800017408 # double 1
-; ALL-NOT: .quad
-
define <8 x double> @f8xf64_f128(<8 x double> %a) {
+; AVX-LABEL: f8xf64_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
; AVX2-LABEL: f8xf64_f128:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xf64_f128:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
;
-; AVX-LABEL: f8xf64_f128:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f8xf64_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xf64_f128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f8xf64_f128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
%res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
ret <8 x double> %res2
@@ -1193,11 +1960,57 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX512-NOT: .quad
define <8 x double> @f8xf64_f256(<8 x double> %a) {
+; AVX-LABEL: f8xf64_f256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xf64_f256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
+;
; AVX512-LABEL: f8xf64_f256:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xf64_f256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xf64_f256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f8xf64_f256:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
%res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
%res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
ret <8 x double> %res2
@@ -1205,32 +2018,34 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
-; ALL: .LCPI38
-; ALL-NEXT: .long 4290379776 # 0xffba0000
-
-; AVX: .LCPI38
-; AVX-NEXT: .long 4290379776 # float NaN
-
define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i32_NaN:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
; ALL32-LABEL: f8xi16_i32_NaN:
; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
+; AVX-64-LABEL: f8xi16_i32_NaN:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
; ALL64-LABEL: f8xi16_i32_NaN:
; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i32_NaN:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
%res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
%res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
ret <8 x i16> %res2
diff --git a/test/CodeGen/X86/bswap-wide-int.ll b/test/CodeGen/X86/bswap-wide-int.ll
index db48eb80de4b..858dbf5fd85f 100644
--- a/test/CodeGen/X86/bswap-wide-int.ll
+++ b/test/CodeGen/X86/bswap-wide-int.ll
@@ -71,8 +71,8 @@ define i128 @bswap_i128(i128 %a0) nounwind {
; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-MOVBE-NEXT: movbel %esi, 12(%eax)
; X86-MOVBE-NEXT: movbel %edi, 8(%eax)
-; X86-MOVBE-NEXT: movbel %ecx, 4(%eax)
-; X86-MOVBE-NEXT: movbel %edx, (%eax)
+; X86-MOVBE-NEXT: movbel %edx, 4(%eax)
+; X86-MOVBE-NEXT: movbel %ecx, (%eax)
; X86-MOVBE-NEXT: popl %esi
; X86-MOVBE-NEXT: popl %edi
; X86-MOVBE-NEXT: retl $4
diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll
index c73d7654045e..531c6de5f90c 100644
--- a/test/CodeGen/X86/build-vector-128.ll
+++ b/test/CodeGen/X86/build-vector-128.ll
@@ -72,12 +72,10 @@ define <4 x float> @test_buildvector_v4f32(float %a0, float %a1, float %a2, floa
}
define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
-; SSE2-32-LABEL: test_buildvector_v2i64:
-; SSE2-32: # BB#0:
-; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-32-NEXT: retl
+; SSE-32-LABEL: test_buildvector_v2i64:
+; SSE-32: # BB#0:
+; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_buildvector_v2i64:
; SSE-64: # BB#0:
@@ -86,20 +84,9 @@ define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
-; SSE41-32-LABEL: test_buildvector_v2i64:
-; SSE41-32: # BB#0:
-; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
-; SSE41-32-NEXT: retl
-;
; AVX-32-LABEL: test_buildvector_v2i64:
; AVX-32: # BB#0:
-; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v2i64:
diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll
index 1ced1fc3a382..942b7779abe6 100644
--- a/test/CodeGen/X86/build-vector-256.ll
+++ b/test/CodeGen/X86/build-vector-256.ll
@@ -51,18 +51,10 @@ define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, floa
}
define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
-; AVX1-32-LABEL: test_buildvector_v4i64:
-; AVX1-32: # BB#0:
-; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-32-NEXT: retl
+; AVX-32-LABEL: test_buildvector_v4i64:
+; AVX-32: # BB#0:
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
+; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v4i64:
; AVX1-64: # BB#0:
@@ -75,19 +67,6 @@ define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-64-NEXT: retq
;
-; AVX2-32-LABEL: test_buildvector_v4i64:
-; AVX2-32: # BB#0:
-; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-32-NEXT: retl
-;
; AVX2-64-LABEL: test_buildvector_v4i64:
; AVX2-64: # BB#0:
; AVX2-64-NEXT: vmovq %rcx, %xmm0
diff --git a/test/CodeGen/X86/build-vector-512.ll b/test/CodeGen/X86/build-vector-512.ll
index 21737cca93a1..fbfbf2d53c63 100644
--- a/test/CodeGen/X86/build-vector-512.ll
+++ b/test/CodeGen/X86/build-vector-512.ll
@@ -79,25 +79,7 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, fl
define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {
; AVX-32-LABEL: test_buildvector_v8i64:
; AVX-32: # BB#0:
-; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8i64:
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index 83ab2fac2f16..260535985e2d 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -148,7 +148,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
; SSE2-NEXT: andnps %xmm5, %xmm0
; SSE2-NEXT: orps %xmm4, %xmm0
; SSE2-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index a6bc5aa321fa..e2a4368b255a 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -1063,87 +1063,89 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
;
; AVX1-LABEL: _clearupper32xi8b:
; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r14
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %r8
+; AVX1-NEXT: movq %rcx, %r9
+; AVX1-NEXT: movq %rcx, %r10
+; AVX1-NEXT: movq %rcx, %r11
+; AVX1-NEXT: movq %rcx, %r14
+; AVX1-NEXT: movq %rcx, %r15
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: movq %rdx, %rcx
+; AVX1-NEXT: movq %rdx, %r12
+; AVX1-NEXT: movq %rdx, %r13
+; AVX1-NEXT: movq %rdx, %rbx
; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %rbp
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: andb $15, %al
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %r10
-; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movq %rcx, %rdx
; AVX1-NEXT: andb $15, %cl
; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rdx
-; AVX1-NEXT: shrq $40, %rdi
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rax
-; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrq $56, %rbp
+; AVX1-NEXT: andb $15, %bpl
+; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $48, %rsi
; AVX1-NEXT: andb $15, %sil
; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rcx
-; AVX1-NEXT: shrq $24, %r11
-; AVX1-NEXT: andb $15, %r11b
-; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rsi
-; AVX1-NEXT: shrq $16, %r9
-; AVX1-NEXT: andb $15, %r9b
-; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rdi
-; AVX1-NEXT: shrq $8, %r8
-; AVX1-NEXT: andb $15, %r8b
-; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %r14, %rbx
-; AVX1-NEXT: andb $15, %r14b
-; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $8, %r10
-; AVX1-NEXT: shrq $16, %rdx
-; AVX1-NEXT: shrq $24, %rax
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: shrq $40, %rsi
-; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $56, %rbx
-; AVX1-NEXT: andb $15, %bl
-; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $40, %rdi
; AVX1-NEXT: andb $15, %dil
; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andb $15, %sil
-; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andb $15, %cl
-; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $32, %rax
; AVX1-NEXT: andb $15, %al
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $24, %rbx
+; AVX1-NEXT: andb $15, %bl
+; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $16, %r13
+; AVX1-NEXT: andb $15, %r13b
+; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $8, %r12
+; AVX1-NEXT: andb $15, %r12b
+; AVX1-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $8, %r8
+; AVX1-NEXT: shrq $16, %r9
+; AVX1-NEXT: shrq $24, %r10
+; AVX1-NEXT: shrq $32, %r11
+; AVX1-NEXT: shrq $40, %r14
+; AVX1-NEXT: shrq $48, %r15
+; AVX1-NEXT: shrq $56, %rdx
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r15b
+; AVX1-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r14b
+; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r11b
+; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: andb $15, %r10b
; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r9b
+; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andb $15, %r8b
+; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: shrl $8, %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shrl $24, %ebx
-; AVX1-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $16, %ebx
+; AVX1-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $24, %ebp
+; AVX1-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
; AVX1-NEXT: shrq $32, %rdi
; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX1-NEXT: shrq $40, %rsi
@@ -1153,8 +1155,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: shrq $56, %r8
-; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX1-NEXT: shrq $56, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $8, %ecx
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
@@ -1222,92 +1224,98 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper32xi8b:
; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %r8
+; AVX2-NEXT: movq %rcx, %r9
+; AVX2-NEXT: movq %rcx, %r10
+; AVX2-NEXT: movq %rcx, %r11
+; AVX2-NEXT: movq %rcx, %r14
+; AVX2-NEXT: movq %rcx, %r15
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: movq %rdx, %rcx
+; AVX2-NEXT: movq %rdx, %r12
+; AVX2-NEXT: movq %rdx, %r13
+; AVX2-NEXT: movq %rdx, %rbx
; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %rbp
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: andb $15, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: andb $15, %cl
; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rdx
-; AVX2-NEXT: shrq $40, %rdi
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rax
-; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrq $56, %rbp
+; AVX2-NEXT: andb $15, %bpl
+; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $48, %rsi
; AVX2-NEXT: andb $15, %sil
; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rcx
-; AVX2-NEXT: shrq $24, %r11
-; AVX2-NEXT: andb $15, %r11b
-; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rsi
-; AVX2-NEXT: shrq $16, %r9
-; AVX2-NEXT: andb $15, %r9b
-; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rdi
-; AVX2-NEXT: shrq $8, %r8
-; AVX2-NEXT: andb $15, %r8b
-; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r14, %rbx
-; AVX2-NEXT: andb $15, %r14b
-; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $8, %r10
-; AVX2-NEXT: shrq $16, %rdx
-; AVX2-NEXT: shrq $24, %rax
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: shrq $40, %rsi
-; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $56, %rbx
-; AVX2-NEXT: andb $15, %bl
-; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $40, %rdi
; AVX2-NEXT: andb $15, %dil
; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: andb $15, %sil
-; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: andb $15, %cl
-; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $32, %rax
; AVX2-NEXT: andb $15, %al
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $24, %rbx
+; AVX2-NEXT: andb $15, %bl
+; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $16, %r13
+; AVX2-NEXT: andb $15, %r13b
+; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $8, %r12
+; AVX2-NEXT: andb $15, %r12b
+; AVX2-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $8, %r8
+; AVX2-NEXT: shrq $16, %r9
+; AVX2-NEXT: shrq $24, %r10
+; AVX2-NEXT: shrq $32, %r11
+; AVX2-NEXT: shrq $40, %r14
+; AVX2-NEXT: shrq $48, %r15
+; AVX2-NEXT: shrq $56, %rdx
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r15b
+; AVX2-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r14b
+; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r11b
+; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: andb $15, %r10b
; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r9b
+; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andb $15, %r8b
+; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shrl $24, %ebx
-; AVX2-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $16, %ebx
+; AVX2-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $24, %ebp
+; AVX2-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
; AVX2-NEXT: shrq $32, %rdi
; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX2-NEXT: shrq $40, %rsi
@@ -1317,8 +1325,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: shrq $56, %r8
-; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX2-NEXT: shrq $56, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %ecx
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
@@ -1386,7 +1394,11 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
%x4 = bitcast <32 x i8> %0 to <64 x i4>
%r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index d901f16e5c73..fca39bca6c76 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,34 +1,36 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
-entry:
; CHECK-LABEL: test1:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
-; CHECK-NEXT: cmovael (%rcx), %eax
-; CHECK-NEXT: ret
-
- %0 = lshr i32 %x, %n ; <i32> [#uses=1]
- %1 = and i32 %0, 1 ; <i32> [#uses=1]
- %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: movl $12, %eax
+; CHECK-NEXT: cmovael (%rcx), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = lshr i32 %x, %n
+ %1 = and i32 %0, 1
+ %toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
- %.0 = select i1 %toBool, i32 %v, i32 12 ; <i32> [#uses=1]
+ %.0 = select i1 %toBool, i32 %v, i32 12
ret i32 %.0
}
+
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
-entry:
; CHECK-LABEL: test2:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
-; CHECK-NEXT: cmovbl (%rcx), %eax
-; CHECK-NEXT: ret
-
- %0 = lshr i32 %x, %n ; <i32> [#uses=1]
- %1 = and i32 %0, 1 ; <i32> [#uses=1]
- %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: movl $12, %eax
+; CHECK-NEXT: cmovbl (%rcx), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = lshr i32 %x, %n
+ %1 = and i32 %0, 1
+ %toBool = icmp eq i32 %1, 0
%v = load i32, i32* %vp
- %.0 = select i1 %toBool, i32 12, i32 %v ; <i32> [#uses=1]
+ %.0 = select i1 %toBool, i32 12, i32 %v
ret i32 %.0
}
@@ -41,10 +43,13 @@ declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: cmov{{n?}}el %[[R1:e..]], %[[R2:e..]]
-; CHECK-NOT: movl
-; CHECK: call
-
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovel %esi, %edi
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
%c = trunc i64 %a to i32
%d = trunc i64 %b to i32
%e = select i1 %p, i32 %c, i32 %d
@@ -65,52 +70,86 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; PR4814
-@g_3 = external global i8 ; <i8*> [#uses=1]
-@g_96 = external global i8 ; <i8*> [#uses=2]
-@g_100 = external global i8 ; <i8*> [#uses=2]
-@_2E_str = external constant [15 x i8], align 1 ; <[15 x i8]*> [#uses=1]
+@g_3 = external global i8
+@g_96 = external global i8
+@g_100 = external global i8
+@_2E_str = external constant [15 x i8], align 1
define i1 @test4() nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movsbl {{.*}}(%rip), %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrb $7, %al
+; CHECK-NEXT: movzbl %al, %ecx
+; CHECK-NEXT: xorl $1, %ecx
+; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; CHECK-NEXT: sarl %cl, %edx
+; CHECK-NEXT: movb {{.*}}(%rip), %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB3_2
+; CHECK-NEXT: # BB#1: # %bb.i.i.i
+; CHECK-NEXT: movb {{.*}}(%rip), %cl
+; CHECK-NEXT: .LBB3_2: # %func_4.exit.i
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: je .LBB3_4
+; CHECK-NEXT: # BB#3: # %func_4.exit.i
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .LBB3_4: # %func_4.exit.i
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB3_7
+; CHECK-NEXT: # BB#5: # %func_4.exit.i
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: jne .LBB3_7
+; CHECK-NEXT: # BB#6: # %bb.i.i
+; CHECK-NEXT: movb {{.*}}(%rip), %cl
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: .LBB3_7: # %func_1.exit
+; CHECK-NEXT: movb %cl, {{.*}}(%rip)
+; CHECK-NEXT: movzbl %cl, %esi
+; CHECK-NEXT: movl $_2E_str, %edi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: callq printf
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
entry:
- %0 = load i8, i8* @g_3, align 1 ; <i8> [#uses=2]
- %1 = sext i8 %0 to i32 ; <i32> [#uses=1]
- %.lobit.i = lshr i8 %0, 7 ; <i8> [#uses=1]
- %tmp.i = zext i8 %.lobit.i to i32 ; <i32> [#uses=1]
- %tmp.not.i = xor i32 %tmp.i, 1 ; <i32> [#uses=1]
- %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i ; <i32> [#uses=1]
- %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8 ; <i8> [#uses=1]
- %2 = icmp eq i8 %retval56.i.i, 0 ; <i1> [#uses=2]
- %g_96.promoted.i = load i8, i8* @g_96 ; <i8> [#uses=3]
- %3 = icmp eq i8 %g_96.promoted.i, 0 ; <i1> [#uses=2]
+ %0 = load i8, i8* @g_3, align 1
+ %1 = sext i8 %0 to i32
+ %.lobit.i = lshr i8 %0, 7
+ %tmp.i = zext i8 %.lobit.i to i32
+ %tmp.not.i = xor i32 %tmp.i, 1
+ %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i
+ %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8
+ %2 = icmp eq i8 %retval56.i.i, 0
+ %g_96.promoted.i = load i8, i8* @g_96
+ %3 = icmp eq i8 %g_96.promoted.i, 0
br i1 %3, label %func_4.exit.i, label %bb.i.i.i
-bb.i.i.i: ; preds = %entry
- %4 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0]
+bb.i.i.i:
+ %4 = load volatile i8, i8* @g_100, align 1
br label %func_4.exit.i
-; CHECK-LABEL: test4:
-; CHECK: g_100
-; CHECK: testb
-; CHECK-NOT: xor
-; CHECK: setne
-; CHECK: testb
-
-func_4.exit.i: ; preds = %bb.i.i.i, %entry
- %.not.i = xor i1 %2, true ; <i1> [#uses=1]
- %brmerge.i = or i1 %3, %.not.i ; <i1> [#uses=1]
- %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0 ; <i8> [#uses=1]
+func_4.exit.i:
+ %.not.i = xor i1 %2, true
+ %brmerge.i = or i1 %3, %.not.i
+ %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0
br i1 %brmerge.i, label %func_1.exit, label %bb.i.i
-bb.i.i: ; preds = %func_4.exit.i
- %5 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0]
+bb.i.i:
+ %5 = load volatile i8, i8* @g_100, align 1
br label %func_1.exit
-func_1.exit: ; preds = %bb.i.i, %func_4.exit.i
- %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] ; <i8> [#uses=2]
+func_1.exit:
+ %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ]
%ret = phi i1 [ 0, %bb.i.i ], [ %.not.i, %func_4.exit.i ]
store i8 %g_96.tmp.0.i, i8* @g_96
- %6 = zext i8 %g_96.tmp.0.i to i32 ; <i32> [#uses=1]
- %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
+ %6 = zext i8 %g_96.tmp.0.i to i32
+ %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind
ret i1 %ret
}
@@ -120,29 +159,32 @@ declare i32 @printf(i8* nocapture, ...) nounwind
; Should compile to setcc | -2.
; rdar://6668608
define i32 @test5(i32* nocapture %P) nounwind readonly {
-entry:
; CHECK-LABEL: test5:
-; CHECK: xorl %eax, %eax
-; CHECK: setg %al
-; CHECK: orl $-2, %eax
-; CHECK: ret
-
- %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
- %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1]
- %iftmp.0.0 = select i1 %1, i32 -1, i32 -2 ; <i32> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $41, (%rdi)
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: orl $-2, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* %P, align 4
+ %1 = icmp sgt i32 %0, 41
+ %iftmp.0.0 = select i1 %1, i32 -1, i32 -2
ret i32 %iftmp.0.0
}
define i32 @test6(i32* nocapture %P) nounwind readonly {
-entry:
; CHECK-LABEL: test6:
-; CHECK: xorl %eax, %eax
-; CHECK: setl %al
-; CHECK: leal 4(%rax,%rax,8), %eax
-; CHECK: ret
- %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
- %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1]
- %iftmp.0.0 = select i1 %1, i32 4, i32 13 ; <i32> [#uses=1]
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $42, (%rdi)
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: leal 4(%rax,%rax,8), %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* %P, align 4
+ %1 = icmp sgt i32 %0, 41
+ %iftmp.0.0 = select i1 %1, i32 4, i32 13
ret i32 %iftmp.0.0
}
@@ -151,16 +193,21 @@ entry:
; because it isn't worth it. Just use a branch instead.
define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
; CHECK-LABEL: test7:
-; CHECK: testb $1, %dil
-; CHECK-NEXT: jne LBB
-
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: jne .LBB6_2
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movb %dl, %sil
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
%d = select i1 %c, i8 %a, i8 %b
ret i8 %d
}
define i32 @smin(i32 %x) {
; CHECK-LABEL: smin:
-; CHECK: ## BB#0:
+; CHECK: # BB#0:
; CHECK-NEXT: xorl $-1, %edi
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: cmovsl %edi, %eax
diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
index d7dc8defac3a..875d791dc802 100644
--- a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -37,7 +37,7 @@ end:
ret void
}
-define void @nested_loop_0() !prof !1 {
+define void @nested_loop_0(i1 %flag) !prof !1 {
; Test if a block that is cold in the inner loop but not cold in the outer loop
; will be merged into the outer loop chain.
;
@@ -68,8 +68,7 @@ if.then:
if.else:
call void @e()
- %call2 = call zeroext i1 @a()
- br i1 %call2, label %header2, label %header, !prof !3
+ br i1 %flag, label %header2, label %header, !prof !3
end:
call void @f()
diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll
index 64e081523c1f..811b1f20833c 100644
--- a/test/CodeGen/X86/combine-avx-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx-intrinsics.ll
@@ -1,59 +1,56 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7)
ret <4 x double> %1
}
-; CHECK-LABEL: test_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7)
ret <8 x float> %1
}
-; CHECK-LABEL: test_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test2_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
ret <4 x double> %1
}
-; CHECK-LABEL: test2_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test2_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
ret <8 x float> %1
}
-; CHECK-LABEL: test2_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test3_x86_avx_blend_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1)
ret <4 x double> %1
}
-; CHECK-LABEL: test3_x86_avx_blend_pd_256
-; CHECK-NOT: vblendpd
-; CHECK: ret
-
define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test3_x86_avx_blend_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1)
ret <8 x float> %1
}
-; CHECK-LABEL: test3_x86_avx_blend_ps_256
-; CHECK-NOT: vblendps
-; CHECK: ret
-
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32)
diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll
index 2714b26c9141..9a548f6b7f0e 100644
--- a/test/CodeGen/X86/combine-avx2-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -1,88 +1,83 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s
; Verify that the backend correctly combines AVX2 builtin intrinsics.
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7)
ret <16 x i16> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7)
ret <4 x i32> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7)
ret <8 x i32> %res
}
-; CHECK-LABEL: test_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0)
ret <16 x i16> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0)
ret <4 x i32> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test2_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0)
ret <8 x i32> %res
}
-; CHECK-LABEL: test2_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1)
ret <16 x i16> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendw
-; CHECK-NOT: vpblendw
-; CHECK: ret
-
define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1)
ret <4 x i32> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendd_128
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test3_x86_avx2_pblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1)
ret <8 x i32> %res
}
-; CHECK-LABEL: test3_x86_avx2_pblendd_256
-; CHECK-NOT: vpblendd
-; CHECK: ret
-
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll
new file mode 100644
index 000000000000..713ee5d0f65a
--- /dev/null
+++ b/test/CodeGen/X86/combine-rotates.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512
+
+; fold (rot (rot x, c1), c2) -> rot x, c1+c2
+define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot:
+; XOP: # BB#0:
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
+; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
+ %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15>
+ %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
+
+define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot_splat:
+; XOP: # BB#0:
+; XOP-NEXT: vprotd $7, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot_splat:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $29, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $10, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22>
+ %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
+
+define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
+; XOP-LABEL: combine_vec_rot_rot_splat_zero:
+; XOP: # BB#0:
+; XOP-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_rot_rot_splat_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+ %3 = or <4 x i32> %1, %2
+ %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31>
+ %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
+ %6 = or <4 x i32> %4, %5
+ ret <4 x i32> %6
+}
diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll
index 1916883c201b..0c8e7b317ec6 100644
--- a/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -1,89 +1,81 @@
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s
define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
ret <2 x double> %1
}
-; CHECK-LABEL: test_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: ret
-
define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
ret <4 x float> %1
}
-; CHECK-LABEL: test_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: ret
-
define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
ret <8 x i16> %1
}
-; CHECK-LABEL: test_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: ret
-
define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test2_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
ret <2 x double> %1
}
-; CHECK-LABEL: test2_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test2_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
ret <4 x float> %1
}
-; CHECK-LABEL: test2_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test2_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
ret <8 x i16> %1
}
-; CHECK-LABEL: test2_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
-
define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
+; CHECK-LABEL: test3_x86_sse41_blend_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
ret <2 x double> %1
}
-; CHECK-LABEL: test3_x86_sse41_blend_pd
-; CHECK-NOT: blendpd
-; CHECK: ret
-
define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
+; CHECK-LABEL: test3_x86_sse41_blend_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
ret <4 x float> %1
}
-; CHECK-LABEL: test3_x86_sse41_blend_ps
-; CHECK-NOT: blendps
-; CHECK: ret
-
define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
+; CHECK-LABEL: test3_x86_sse41_pblend_w:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
ret <8 x i16> %1
}
-; CHECK-LABEL: test3_x86_sse41_pblend_w
-; CHECK-NOT: pblendw
-; CHECK: ret
-
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll
index 83589b7706f7..d73f7163fd87 100644
--- a/test/CodeGen/X86/constant-hoisting-bfi.ll
+++ b/test/CodeGen/X86/constant-hoisting-bfi.ll
@@ -4,13 +4,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Check that, when BFI is enabled for constant hoisting, constant 214748364701
; will not be hoisted to the func entry.
-; CHECK-LABEL: @foo(
+; CHECK-LABEL: @test1(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: if.then:
; Function Attrs: norecurse nounwind uwtable
-define i64 @foo(i64* nocapture %a) {
+define i64 @test1(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
@@ -52,7 +52,7 @@ return: ; preds = %if.else5, %if.then,
; in while.body will be hoisted to while.body.preheader. 214748364701 in
; if.then16 and if.else10 will be merged and hoisted to the beginning of
; if.else10 because if.else10 dominates if.then16.
-; CHECK-LABEL: @goo(
+; CHECK-LABEL: @test2(
; CHECK: entry:
; CHECK-NOT: bitcast i64 214748364701 to i64
; CHECK: while.body.preheader:
@@ -61,7 +61,7 @@ return: ; preds = %if.else5, %if.then,
; CHECK: if.else10:
; CHECK-NEXT: bitcast i64 214748364701 to i64
; CHECK-NOT: bitcast i64 214748364701 to i64
-define i64 @goo(i64* nocapture %a) {
+define i64 @test2(i64* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds i64, i64* %a, i64 9
%t0 = load i64, i64* %arrayidx, align 8
@@ -113,3 +113,47 @@ return: ; preds = %while.cond.preheade
}
!0 = !{!"branch_weights", i32 1, i32 2000}
+
+; 214748364701 will be hoisted to entry block to reduce code size.
+; CHECK-LABEL: @test3(
+; CHECK: entry:
+; CHECK-NEXT: %const = bitcast i64 214748364701 to i64
+define i64 @test3(i64 %t0) {
+entry:
+ %cmp = icmp ult i64 %t0, 56
+ br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK-NOT: %const = bitcast i64 214748364701 to i64
+if.then:
+ %add1 = add i64 %t0, 214748364701
+ br label %return
+
+; CHECK: if.else:
+; CHECK-NOT: %const = bitcast i64 214748364701 to i64
+if.else:
+ %add2 = add i64 %t0, 214748364701
+ br label %return
+
+return:
+ %retval = phi i64 [ %add1, %if.then ], [ %add2, %if.else ]
+ ret i64 %retval
+}
+
+; 214748364701 will not be hoisted to entry block because it will only
+; increase its live range.
+; CHECK-LABEL: @test4(
+; CHECK: nextblock:
+; CHECK-NEXT: %add1 = add i64 %t0, 214748364701
+define i64 @test4(i64 %t0) {
+entry:
+ %cmp = icmp ult i64 %t0, 56
+ br label %nextblock
+
+nextblock:
+ %add1 = add i64 %t0, 214748364701
+ br label %return
+
+return:
+ ret i64 %add1
+}
diff --git a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
index 9dd184c8ab31..88778b317b97 100644
--- a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -62,4 +62,128 @@ define void @test_memcpy_args(i8** %Storage) {
call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4) ret void
}
+define i8* @test_memmove1(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 1)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $1, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_1
+}
+
+define i8* @test_memmove2(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove2
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 2, i32 2)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $2, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_2
+}
+
+define i8* @test_memmove4(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove4
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 4, i32 4)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_4
+}
+
+define i8* @test_memmove8(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove8
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %P, i8* align 8 %Q, i32 8, i32 8)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $8, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_8
+}
+
+define i8* @test_memmove16(i8* %P, i8* %Q) {
+ ; CHECK: test_memmove16
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %P, i8* align 16 %Q, i32 16, i32 16)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $16, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_16
+}
+
+define void @test_memmove_args(i8** %Storage) {
+ ; CHECK: test_memmove_args
+ %Dst = load i8*, i8** %Storage
+ %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+ %Src = load i8*, i8** %Src.addr
+
+ ; 1st arg (%rdi)
+ ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+ ; CHECK-DAG: movq [[REG1]], %rdi
+ ; 2nd arg (%rsi)
+ ; CHECK-DAG: movq 8(%rdi), %rsi
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memmove_element_unordered_atomic_4
+ call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4) ret void
+}
+
+define i8* @test_memset1(i8* %P, i8 %V) {
+ ; CHECK: test_memset
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 1)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $1, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_1
+}
+
+define i8* @test_memset2(i8* %P, i8 %V) {
+ ; CHECK: test_memset2
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 2, i32 2)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $2, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_2
+}
+
+define i8* @test_memset4(i8* %P, i8 %V) {
+ ; CHECK: test_memset4
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 4, i32 4)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_4
+}
+
+define i8* @test_memset8(i8* %P, i8 %V) {
+ ; CHECK: test_memset8
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 %P, i8 %V, i32 8, i32 8)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $8, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_8
+}
+
+define i8* @test_memset16(i8* %P, i8 %V) {
+ ; CHECK: test_memset16
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 16 %P, i8 %V, i32 16, i32 16)
+ ret i8* %P
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $16, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_16
+}
+
+define void @test_memset_args(i8** %Storage, i8* %V) {
+ ; CHECK: test_memset_args
+ %Dst = load i8*, i8** %Storage
+ %Val = load i8, i8* %V
+
+ ; 1st arg (%rdi)
+ ; CHECK-DAG: movq (%rdi), %rdi
+ ; 2nd arg (%rsi)
+ ; CHECK-DAG: movzbl (%rsi), %esi
+ ; 3rd arg (%edx) -- length
+ ; CHECK-DAG: movl $4, %edx
+ ; CHECK: __llvm_memset_element_unordered_atomic_4
+ call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %Dst, i8 %Val, i32 4, i32 4) ret void
+}
+
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture, i8, i32, i32) nounwind
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
index 48cb8d70b974..4ea6b7801fb3 100644
--- a/test/CodeGen/X86/extract-store.ll
+++ b/test/CodeGen/X86/extract-store.ll
@@ -345,7 +345,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
; SSE-X32-LABEL: extract_i64_1:
; SSE-X32: # BB#0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE-X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-X32-NEXT: movq %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 5d5cbc76f92e..4d0b5ccc16b0 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
@@ -6,31 +7,31 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; into loads, off the stack or a previous store.
; Be very explicit about the ordering/stack offsets.
-; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK: # BB#0
-; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movl 16(%esp), %eax
-; CHECK-NEXT: movl 24(%esp), %ecx
-; CHECK-NEXT: movl 20(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: popl %ebx
-; CHECK-NEXT: retl
-
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
+; CHECK-LABEL: test_extractelement_legalization_storereuse:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: paddd (%ecx), %xmm0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movdqa %xmm0, (%ecx)
+; CHECK-NEXT: movl (%ecx), %esi
+; CHECK-NEXT: movl 4(%ecx), %edi
+; CHECK-NEXT: shll $4, %edx
+; CHECK-NEXT: movl 8(%ecx), %ebx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %esi, 12(%eax,%edx)
+; CHECK-NEXT: movl %edi, (%eax,%edx)
+; CHECK-NEXT: movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT: movl %ecx, 4(%eax,%edx)
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = bitcast i32* %y to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 16
diff --git a/test/CodeGen/X86/fast-isel-abort-warm.ll b/test/CodeGen/X86/fast-isel-abort-warm.ll
index 3caa91b11ec6..e87d14bb28ad 100644
--- a/test/CodeGen/X86/fast-isel-abort-warm.ll
+++ b/test/CodeGen/X86/fast-isel-abort-warm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback 2>&1 | FileCheck %s
+; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback -pass-remarks-missed=isel 2>&1 | FileCheck %s
; Make sure FastISel reports a warning when we ask it to do so.
; Note: This test needs to use something that is not supported by FastISel.
; Thus, this test may start failing if inline asm becomes supported in FastISel.
@@ -6,9 +6,26 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"
+; CHECK: remark: <unknown>:0:0: FastISel missed call: call void asm sideeffect
; CHECK: warning: Instruction selection used fallback path for foo
define void @foo(){
entry:
call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"()
ret void
}
+
+; CHECK: remark: <unknown>:0:0: FastISel missed: store i128
+; CHECK: warning: Instruction selection used fallback path for test_instruction_fallback
+define void @test_instruction_fallback(i128* %ptr){
+ %v1 = load i128, i128* %ptr
+ %result = add i128 %v1, %v1
+ store i128 %result, i128 * %ptr
+ ret void
+}
+
+; CHECK-NOT: remark: <unknown>:0:0: FastISel missed
+; CHECK-NOT: warning: Instruction selection used fallback path for test_instruction_not_fallback
+define i32 @test_instruction_not_fallback(i32 %a){
+ %result = add i32 %a, %a
+ ret i32 %result
+}
diff --git a/test/CodeGen/X86/fast-isel-gc-intrinsics.ll b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
new file mode 100644
index 000000000000..bf08ad01d7d8
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -fast-isel
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+; Don't crash with gc intrinsics.
+
+; A gcrelocate call should not end up in an LLVM machine block by itself.
+define i8 addrspace(1)* @test_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7)
+ ret i8 addrspace(1)* %vnew
+}
+
+; gcresult calls are fine in their own blocks.
+define i1 @test_gcresult() gc "statepoint-example" {
+entry:
+ %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0)
+ %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+ ret i1 %call1
+}
+
+; We are okay here because we see the gcrelocate and avoid generating its own
+; block.
+define i1 @test_gcresult_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+ %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7)
+ ret i1 %call1
+}
+
+define i8 addrspace(1)* @test_non_entry_block(i8 addrspace(1)* %v, i8 %val) gc "statepoint-example" {
+entry:
+ %load = load i8, i8 addrspace(1)* %v
+ %cmp = icmp eq i8 %load, %val
+ br i1 %cmp, label %func_call, label %exit
+
+func_call:
+ call void @dummy()
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+ %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7)
+ ret i8 addrspace(1)* %vnew
+
+exit:
+ ret i8 addrspace(1)* %v
+
+}
+
+declare void @dummy()
+declare void @foo()
+
+declare zeroext i1 @return_i1()
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
+declare i1 @llvm.experimental.gc.result.i1(token)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
diff --git a/test/CodeGen/X86/fastisel-softfloat.ll b/test/CodeGen/X86/fastisel-softfloat.ll
new file mode 100644
index 000000000000..e4330db81e1a
--- /dev/null
+++ b/test/CodeGen/X86/fastisel-softfloat.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define float @pr26522(float %pat) #0 {
+; CHECK-LABEL: pr26522:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ ret float %pat
+}
+
+attributes #0 = { noinline optnone "target-features"="+soft-float" }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 6c6bc8bdc1d1..98082ec611d4 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index c3109673468e..e09ad3e4e0b8 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -16,10 +16,10 @@
; LIN: sarq $32, %r[[REG2]]
; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
; LIN: sarq $32, %r[[REG4]]
-; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movq %rdi, %xmm1
+; LIN: movq %r[[REG3]], %xmm0
; WIN: movdqa (%rdx), %xmm0
; WIN: pand (%r8), %xmm0
@@ -29,10 +29,10 @@
; WIN: sarq $32, %r[[REG2]]
; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
; WIN: sarq $32, %r[[REG4]]
-; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movdqa (%r[[REG2]]), %xmm0
+; WIN: movq %r[[REG2]], %xmm1
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
%a = load <4 x i32>, <4 x i32>* %i
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index 4c8003f0c516..b7c43d3b2e3e 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -1,266 +1,833 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN: | FileCheck %s -check-prefix=CHECK-I686
-
-define void @test_load_store(half* %in, half* %out) {
-; CHECK-LABEL: test_load_store:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
-; CHECK: movw %ax, (%rsi)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,BWON,BWON-F16C
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefixes=CHECK-I686
+
+define void @test_load_store(half* %in, half* %out) #0 {
+; BWON-LABEL: test_load_store:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: movw %ax, (%rsi)
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_load_store:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: movw %ax, (%rsi)
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_load_store:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw (%ecx), %cx
+; CHECK-I686-NEXT: movw %cx, (%eax)
+; CHECK-I686-NEXT: retl
%val = load half, half* %in
store half %val, half* %out
ret void
}
-define i16 @test_bitcast_from_half(half* %addr) {
-; CHECK-LABEL: test_bitcast_from_half:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
+define i16 @test_bitcast_from_half(half* %addr) #0 {
+; BWON-LABEL: test_bitcast_from_half:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_bitcast_from_half:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_from_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movw (%eax), %ax
+; CHECK-I686-NEXT: retl
%val = load half, half* %addr
%val_int = bitcast half %val to i16
ret i16 %val_int
}
-define void @test_bitcast_to_half(half* %addr, i16 %in) {
+define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
; CHECK-LABEL: test_bitcast_to_half:
-; CHECK: movw %si, (%rdi)
+; CHECK: # BB#0:
+; CHECK-NEXT: movw %si, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_to_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw %ax, (%ecx)
+; CHECK-I686-NEXT: retl
%val_fp = bitcast i16 %in to half
store half %val_fp, half* %addr
ret void
}
-define float @test_extend32(half* %addr) {
-; CHECK-LABEL: test_extend32:
-
-; CHECK-LIBCALL: jmp __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
+define float @test_extend32(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL
+;
+; BWON-F16C-LABEL: test_extend32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to float
ret float %val32
}
-define double @test_extend64(half* %addr) {
-; CHECK-LABEL: test_extend64:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtss2sd
+define double @test_extend64(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to double
ret double %val32
}
-define void @test_trunc32(float %in, half* %addr) {
-; CHECK-LABEL: test_trunc32:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
+define void @test_trunc32(float %in, half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_trunc32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rdi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc float %in to half
store half %val16, half* %addr
ret void
}
-define void @test_trunc64(double %in, half* %addr) {
+define void @test_trunc64(double %in, half* %addr) #0 {
; CHECK-LABEL: test_trunc64:
-
-; CHECK-LIBCALL: callq __truncdfhf2
-; CHECK-F16C: callq __truncdfhf2
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: callq __truncdfhf2
+; CHECK-NEXT: movw %ax, (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movsd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc double %in to half
store half %val16, half* %addr
ret void
}
define i64 @test_fptosi_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptosi_i64:
-
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptosi_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_fptosi_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptosi_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixsfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptosi half %a to i64
ret i64 %r
}
define void @test_sitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_sitofp_i64:
-
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK_LIBCALL-NEXT: popq [[ADDR]]
-; CHECK_LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_sitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_sitofp_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = sitofp i64 %a to half
store half %r, half* %p
ret void
}
define i64 @test_fptoui_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptoui_i64:
-
-; FP_TO_UINT is expanded using FP_TO_SINT
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss {{.[A-Z_0-9]+}}(%rip), [[REG1:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movaps %xmm0, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: subss [[REG1]], [[REG2]]
-; CHECK-LIBCALL-NEXT: cvttss2si [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, [[REG4:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: xorq [[REG3]], [[REG4]]
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, [[REG5:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: ucomiss [[REG1]], %xmm0
-; CHECK-LIBCALL-NEXT: cmovaeq [[REG4]], [[REG5]]
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovss {{.[A-Z_0-9]+}}(%rip), [[REG3:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vsubss [[REG3]], [[REG2]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG4]], [[REG5:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: movabsq $-9223372036854775808, [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: xorq [[REG5]], [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], [[REG7:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vucomiss [[REG3]], [[REG2]]
-; CHECK-F16C-NEXT: cmovaeq [[REG6]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptoui_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2
+; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rax
+; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-LIBCALL-NEXT: xorq %rax, %rcx
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT: cmovaeq %rcx, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_fptoui_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; BWON-F16C-NEXT: vcvttss2si %xmm2, %rax
+; BWON-F16C-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; BWON-F16C-NEXT: xorq %rax, %rcx
+; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax
+; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0
+; BWON-F16C-NEXT: cmovaeq %rcx, %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptoui_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixunssfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptoui half %a to i64
ret i64 %r
}
define void @test_uitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_uitofp_i64:
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]]
-
-; simple conversion to float if non-negative
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]]
-; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]]
-
-; convert using shift+or if negative
-; CHECK-NEXT: [[LABEL1]]:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]]
-; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]]
-; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]]
-
-; convert float to half
-; CHECK-NEXT: [[LABEL2]]:
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-NEXT: retq
-
+; CHECK-LIBCALL-LABEL: test_uitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT: js .LBB10_1
+; CHECK-LIBCALL-NEXT: # BB#2:
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: jmp .LBB10_3
+; CHECK-LIBCALL-NEXT: .LBB10_1:
+; CHECK-LIBCALL-NEXT: movq %rdi, %rax
+; CHECK-LIBCALL-NEXT: shrq %rax
+; CHECK-LIBCALL-NEXT: andl $1, %edi
+; CHECK-LIBCALL-NEXT: orq %rax, %rdi
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: .LBB10_3:
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_uitofp_i64:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: testq %rdi, %rdi
+; BWON-F16C-NEXT: js .LBB10_1
+; BWON-F16C-NEXT: # BB#2:
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: jmp .LBB10_3
+; BWON-F16C-NEXT: .LBB10_1:
+; BWON-F16C-NEXT: movq %rdi, %rax
+; BWON-F16C-NEXT: shrq %rax
+; BWON-F16C-NEXT: andl $1, %edi
+; BWON-F16C-NEXT: orq %rax, %rdi
+; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: .LBB10_3:
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %eax
+; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_uitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: xorl %eax, %eax
+; CHECK-I686-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: setns %al
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = uitofp i64 %a to half
store half %r, half* %p
ret void
}
define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
+; CHECK-LIBCALL-LABEL: test_extend32_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $48, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: addq $48, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend32_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl 6(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: movswl 2(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $56, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-I686-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT: addl $56, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x float>
ret <4 x float> %b
}
define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend64_vec4
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C: vcvtss2sd
+; CHECK-LIBCALL-LABEL: test_extend64_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_extend64_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: movswl 2(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: movswl 4(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; BWON-F16C-NEXT: movswl 6(%rdi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm3
+; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $88, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-I686-NEXT: addl $88, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x double>
ret <4 x double> %b
}
-define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
+; BWON-NOF16C-LABEL: test_trunc32_vec4:
+; BWON-NOF16C: # BB#0:
+; BWON-NOF16C-NEXT: pushq %rbp
+; BWON-NOF16C-NEXT: pushq %r15
+; BWON-NOF16C-NEXT: pushq %r14
+; BWON-NOF16C-NEXT: pushq %rbx
+; BWON-NOF16C-NEXT: subq $24, %rsp
+; BWON-NOF16C-NEXT: movq %rdi, %rbx
+; BWON-NOF16C-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %r14d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %r15d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movl %eax, %ebp
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee
+; BWON-NOF16C-NEXT: movw %ax, (%rbx)
+; BWON-NOF16C-NEXT: movw %bp, 6(%rbx)
+; BWON-NOF16C-NEXT: movw %r15w, 4(%rbx)
+; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT: addq $24, %rsp
+; BWON-NOF16C-NEXT: popq %rbx
+; BWON-NOF16C-NEXT: popq %r14
+; BWON-NOF16C-NEXT: popq %r15
+; BWON-NOF16C-NEXT: popq %rbp
+; BWON-NOF16C-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc32_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $24, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, (%rbx)
+; BWOFF-NEXT: movw %bp, 6(%rbx)
+; BWOFF-NEXT: movw %r15w, 4(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $24, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc32_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %eax
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %ecx
+; BWON-F16C-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovd %xmm1, %edx
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovd %xmm0, %esi
+; BWON-F16C-NEXT: movw %si, (%rdi)
+; BWON-F16C-NEXT: movw %dx, 6(%rdi)
+; BWON-F16C-NEXT: movw %cx, 4(%rdi)
+; BWON-F16C-NEXT: movw %ax, 2(%rdi)
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $44, %esp
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movaps %xmm0, %xmm1
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-I686-NEXT: movss %xmm1, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%ebp)
+; CHECK-I686-NEXT: movw %bx, 6(%ebp)
+; CHECK-I686-NEXT: movw %di, 4(%ebp)
+; CHECK-I686-NEXT: movw %si, 2(%ebp)
+; CHECK-I686-NEXT: addl $44, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x float> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
}
-define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc64_vec4:
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
+; BWON-NOF16C-LABEL: test_trunc64_vec4:
+; BWON-NOF16C: # BB#0:
+; BWON-NOF16C-NEXT: pushq %rbp
+; BWON-NOF16C-NEXT: pushq %r15
+; BWON-NOF16C-NEXT: pushq %r14
+; BWON-NOF16C-NEXT: pushq %rbx
+; BWON-NOF16C-NEXT: subq $40, %rsp
+; BWON-NOF16C-NEXT: movq %rdi, %rbx
+; BWON-NOF16C-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %r14d
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %r15d
+; BWON-NOF16C-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movl %eax, %ebp
+; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT: callq __truncdfhf2
+; BWON-NOF16C-NEXT: movw %ax, 4(%rbx)
+; BWON-NOF16C-NEXT: movw %bp, (%rbx)
+; BWON-NOF16C-NEXT: movw %r15w, 6(%rbx)
+; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-NOF16C-NEXT: addq $40, %rsp
+; BWON-NOF16C-NEXT: popq %rbx
+; BWON-NOF16C-NEXT: popq %r14
+; BWON-NOF16C-NEXT: popq %r15
+; BWON-NOF16C-NEXT: popq %rbp
+; BWON-NOF16C-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc64_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $40, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, 4(%rbx)
+; BWOFF-NEXT: movw %bp, (%rbx)
+; BWOFF-NEXT: movw %r15w, 6(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $40, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; BWON-F16C-LABEL: test_trunc64_vec4:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: pushq %rbp
+; BWON-F16C-NEXT: pushq %r15
+; BWON-F16C-NEXT: pushq %r14
+; BWON-F16C-NEXT: pushq %rbx
+; BWON-F16C-NEXT: subq $88, %rsp
+; BWON-F16C-NEXT: movq %rdi, %rbx
+; BWON-F16C-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %r14d
+; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
+; BWON-F16C-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %r15d
+; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; BWON-F16C-NEXT: vzeroupper
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movl %eax, %ebp
+; BWON-F16C-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-F16C-NEXT: callq __truncdfhf2
+; BWON-F16C-NEXT: movw %ax, 4(%rbx)
+; BWON-F16C-NEXT: movw %bp, (%rbx)
+; BWON-F16C-NEXT: movw %r15w, 6(%rbx)
+; BWON-F16C-NEXT: movw %r14w, 2(%rbx)
+; BWON-F16C-NEXT: addq $88, %rsp
+; BWON-F16C-NEXT: popq %rbx
+; BWON-F16C-NEXT: popq %r14
+; BWON-F16C-NEXT: popq %r15
+; BWON-F16C-NEXT: popq %rbp
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $60, %esp
+; CHECK-I686-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, 6(%ebp)
+; CHECK-I686-NEXT: movw %bx, 4(%ebp)
+; CHECK-I686-NEXT: movw %di, 2(%ebp)
+; CHECK-I686-NEXT: movw %si, (%ebp)
+; CHECK-I686-NEXT: addl $60, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x double> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
@@ -272,40 +839,98 @@ declare float @test_floatret();
; to f80 and then rounded to f32. The DAG combiner should not combine this
; fp_round and the subsequent fptrunc from float to half.
define half @test_f80trunc_nodagcombine() #0 {
-; CHECK-LABEL: test_f80trunc_nodagcombine:
-; CHECK-I686-NOT: calll __truncxfhf2
+; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: callq test_floatret
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_f80trunc_nodagcombine:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: pushq %rax
+; BWON-F16C-NEXT: callq test_floatret
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: popq %rax
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: calll test_floatret
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%1 = call float @test_floatret()
%2 = fptrunc float %1 to half
ret half %2
}
-; CHECK-LABEL: test_sitofp_fadd_i32:
-; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movl %edi, %ebx
-; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
-; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: addq $16, %rsp
-; CHECK-LIBCALL-NEXT: popq %rbx
-; CHECK-LIBCALL-NEXT: retq
-; CHECK-F16C-NEXT: movswl (%rsi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
-; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-F16C-NEXT: retq
define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movl %edi, %ebx
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; BWON-F16C-LABEL: test_sitofp_fadd_i32:
+; BWON-F16C: # BB#0:
+; BWON-F16C-NEXT: movswl (%rsi), %eax
+; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; BWON-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_fadd_i32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $28, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-I686-NEXT: xorps %xmm0, %xmm0
+; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload
+; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: addl $28, %esp
+; CHECK-I686-NEXT: retl
%tmp0 = load half, half* %b
%tmp1 = sitofp i32 %a to half
%tmp2 = fadd half %tmp0, %tmp1
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index ceb465711906..5425670fbb1e 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -1,17 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
define void @i24_or(i24* %a) {
-; CHECK-LABEL: i24_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl $384, %eax # imm = 0x180
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzbl 2(%rdi), %ecx
+; X64-NEXT: movb %cl, 2(%rdi)
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: orl $384, %ecx # imm = 0x180
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%aa = load i24, i24* %a, align 1
%b = or i24 %aa, 384
store i24 %b, i24* %a, align 1
@@ -19,17 +32,30 @@ define void @i24_or(i24* %a) {
}
define void @i24_and_or(i24* %a) {
-; CHECK-LABEL: i24_and_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movzbl 2(%rdi), %ecx
-; CHECK-NEXT: movb %cl, 2(%rdi)
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: orl $384, %ecx # imm = 0x180
-; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
-; CHECK-NEXT: movw %cx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_and_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl $384, %eax # imm = 0x180
+; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_and_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzbl 2(%rdi), %ecx
+; X64-NEXT: movb %cl, 2(%rdi)
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: orl $384, %ecx # imm = 0x180
+; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%b = load i24, i24* %a, align 1
%c = and i24 %b, -128
%d = or i24 %c, 384
@@ -38,19 +64,40 @@ define void @i24_and_or(i24* %a) {
}
define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
-; CHECK-LABEL: i24_insert_bit:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl (%rdi), %ecx
-; CHECK-NEXT: movzbl 2(%rdi), %edx
-; CHECK-NEXT: movb %dl, 2(%rdi)
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shll $13, %eax
-; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
-; CHECK-NEXT: orl %eax, %edx
-; CHECK-NEXT: movw %dx, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i24_insert_bit:
+; X86: # BB#0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .Lcfi0:
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .Lcfi1:
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl (%ecx), %esi
+; X86-NEXT: movzbl 2(%ecx), %eax
+; X86-NEXT: movb %al, 2(%ecx)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: shll $13, %edx
+; X86-NEXT: andl $16769023, %eax # imm = 0xFFDFFF
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movw %ax, (%ecx)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: i24_insert_bit:
+; X64: # BB#0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzwl (%rdi), %ecx
+; X64-NEXT: movzbl 2(%rdi), %edx
+; X64-NEXT: movb %dl, 2(%rdi)
+; X64-NEXT: shll $16, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: shll $13, %eax
+; X64-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
+; X64-NEXT: orl %eax, %edx
+; X64-NEXT: movw %dx, (%rdi)
+; X64-NEXT: retq
%extbit = zext i1 %bit to i24
%b = load i24, i24* %a, align 1
%extbit.shl = shl nuw nsw i24 %extbit, 13
@@ -61,22 +108,28 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
}
define void @i56_or(i56* %a) {
-; CHECK-LABEL: i56_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movl %edx, (%rdi)
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: movw %dx, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl $384, (%eax) # imm = 0x180
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: movl (%rdi), %edx
+; X64-NEXT: movb %cl, 6(%rdi)
+; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: orq $384, %rdx # imm = 0x180
+; X64-NEXT: movl %edx, (%rdi)
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movw %dx, 4(%rdi)
+; X64-NEXT: retq
%aa = load i56, i56* %a, align 1
%b = or i56 %aa, 384
store i56 %b, i56* %a, align 1
@@ -84,24 +137,33 @@ define void @i56_or(i56* %a) {
}
define void @i56_and_or(i56* %a) {
-; CHECK-LABEL: i56_and_or:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzwl 4(%rdi), %eax
-; CHECK-NEXT: movzbl 6(%rdi), %ecx
-; CHECK-NEXT: movl (%rdi), %edx
-; CHECK-NEXT: movb %cl, 6(%rdi)
-; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: shlq $32, %rcx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq $384, %rdx # imm = 0x180
-; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_and_or:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $384, %ecx # imm = 0x180
+; X86-NEXT: orl (%eax), %ecx
+; X86-NEXT: andl $-128, %ecx
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_and_or:
+; X64: # BB#0:
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: movl (%rdi), %edx
+; X64-NEXT: movb %cl, 6(%rdi)
+; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: orq $384, %rdx # imm = 0x180
+; X64-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movw %ax, 4(%rdi)
+; X64-NEXT: retq
%b = load i56, i56* %a, align 1
%c = and i56 %b, -128
%d = or i56 %c, 384
@@ -110,26 +172,37 @@ define void @i56_and_or(i56* %a) {
}
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
-; CHECK-LABEL: i56_insert_bit:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: movl (%rdi), %esi
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: orq %rdx, %rsi
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movw %cx, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: i56_insert_bit:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $13, %ecx
+; X86-NEXT: movl $-8193, %edx # imm = 0xDFFF
+; X86-NEXT: andl (%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: i56_insert_bit:
+; X64: # BB#0:
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzwl 4(%rdi), %ecx
+; X64-NEXT: movzbl 6(%rdi), %edx
+; X64-NEXT: movl (%rdi), %esi
+; X64-NEXT: movb %dl, 6(%rdi)
+; X64-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
+; X64-NEXT: shll $16, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: shlq $32, %rdx
+; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: shlq $13, %rax
+; X64-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
+; X64-NEXT: andq %rsi, %rcx
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: movw %cx, 4(%rdi)
+; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
%extbit.shl = shl nuw nsw i56 %extbit, 13
diff --git a/test/CodeGen/X86/optimize-max-1.ll b/test/CodeGen/X86/optimize-max-1.ll
index 11e2f9a93a57..08cb86ab3989 100644
--- a/test/CodeGen/X86/optimize-max-1.ll
+++ b/test/CodeGen/X86/optimize-max-1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 | not grep cmov
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; LSR should be able to eliminate both smax and umax expressions
; in loop trip counts.
@@ -6,6 +7,18 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @fs(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: fs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jl .LBB0_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp slt i64 %n, 1 ; <i1> [#uses=1]
%smax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1]
@@ -24,6 +37,18 @@ return: ; preds = %bb
}
define void @bs(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: bs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jl .LBB1_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp sge i64 %n, 1 ; <i1> [#uses=1]
%smax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1]
@@ -42,6 +67,18 @@ return: ; preds = %bb
}
define void @fu(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: fu:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jb .LBB2_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %n, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1]
@@ -60,6 +97,18 @@ return: ; preds = %bb
}
define void @bu(double* nocapture %p, i64 %n) nounwind {
+; CHECK-LABEL: bu:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB3_1: # %bb
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rdi,%rax,8)
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: jb .LBB3_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp ne i64 %n, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/optimize-max-2.ll b/test/CodeGen/X86/optimize-max-2.ll
index 45b542e2267c..37d2a20975a0 100644
--- a/test/CodeGen/X86/optimize-max-2.ll
+++ b/test/CodeGen/X86/optimize-max-2.ll
@@ -1,8 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep cmov | count 2
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-; CHECK: jne
-; CHECK-NOT: jne
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; LSR's OptimizeMax function shouldn't try to eliminate this max, because
; it has three operands.
@@ -10,6 +7,24 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @foo(double* nocapture %p, i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovneq %rdx, %rax
+; CHECK-NEXT: cmpq %rsi, %rax
+; CHECK-NEXT: cmovbeq %rsi, %rax
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %bb4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: addsd %xmm0, %xmm0
+; CHECK-NEXT: movsd %xmm0, (%rdi)
+; CHECK-NEXT: addq $8, %rdi
+; CHECK-NEXT: decq %rax
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %y, 0 ; <i1> [#uses=1]
%umax = select i1 %tmp, i64 1, i64 %y ; <i64> [#uses=2]
@@ -30,3 +45,4 @@ bb4: ; preds = %bb4, %entry
return: ; preds = %bb4
ret void
}
+
diff --git a/test/CodeGen/X86/pr15309.ll b/test/CodeGen/X86/pr15309.ll
index e9d9b9e54c13..0301b58def1c 100644
--- a/test/CodeGen/X86/pr15309.ll
+++ b/test/CodeGen/X86/pr15309.ll
@@ -1,15 +1,43 @@
-; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux-pc | FileCheck %s
-define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) noinline {
-L.entry:
- %0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10
- %1 = load <2 x i64>, <2 x i64>* %0, align 16
- %2 = uitofp <2 x i64> %1 to <2 x float>
- %3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10
- store <2 x float> %2, <2 x float>* %3, align 8
+define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) nounwind {
+; CHECK-LABEL: test_convert_float2_ulong2:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl 168(%ecx), %edx
+; CHECK-NEXT: movl 172(%ecx), %esi
+; CHECK-NEXT: movl 160(%ecx), %edi
+; CHECK-NEXT: movl 164(%ecx), %ecx
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, (%esp)
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setns %dl
+; CHECK-NEXT: fildll (%esp)
+; CHECK-NEXT: fadds {{\.LCPI.*}}(,%edx,4)
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setns %cl
+; CHECK-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
+; CHECK-NEXT: fstps 84(%eax)
+; CHECK-NEXT: fstps 80(%eax)
+; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
+ %t0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10
+ %t1 = load <2 x i64>, <2 x i64>* %t0, align 16
+ %t2 = uitofp <2 x i64> %t1 to <2 x float>
+ %t3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10
+ store <2 x float> %t2, <2 x float>* %t3, align 8
ret void
}
-; CHECK: test_convert_float2_ulong2
-; CHECK-NOT: cvtpd2ps
-; CHECK: ret
diff --git a/test/CodeGen/X86/pr23603.ll b/test/CodeGen/X86/pr23603.ll
index 6f856aedb8d5..315e60768613 100644
--- a/test/CodeGen/X86/pr23603.ll
+++ b/test/CodeGen/X86/pr23603.ll
@@ -1,14 +1,29 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
declare void @free_v()
-define void @f(i32* %x, i32 %c32, i32* %y) {
-; CHECK-LABEL: f
+define void @f(i32* %x, i32 %c32, i32* %y) nounwind {
+; CHECK-LABEL: f:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: movl (%rdi), %ebx
+; CHECK-NEXT: callq free_v
+; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # BB#1: # %left
+; CHECK-NEXT: movl %ebx, (%r14)
+; CHECK-NEXT: .LBB0_2: # %merge
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
entry:
%v = load i32, i32* %x, !invariant.load !0
-; CHECK: movl (%rdi), %ebx
-; CHECK: free_v
-; CHECK-NOT: movl (%rdi), %ebx
call void @free_v()
%c = icmp ne i32 %c32, 0
br i1 %c, label %left, label %merge
diff --git a/test/CodeGen/X86/pr33715.ll b/test/CodeGen/X86/pr33715.ll
new file mode 100644
index 000000000000..15432cfdb512
--- /dev/null
+++ b/test/CodeGen/X86/pr33715.ll
@@ -0,0 +1,16 @@
+; Make sure we don't crash with a build vector of integer constants.
+; RUN: llc %s -o /dev/null
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @patatino() {
+ %tmp = insertelement <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>, i32 1, i32 2
+ %tmp1 = insertelement <4 x i32> %tmp, i32 1, i32 3
+ %tmp2 = icmp ne <4 x i32> %tmp1, zeroinitializer
+ %tmp3 = icmp slt <4 x i32> %tmp1, <i32 4, i32 4, i32 4, i32 4>
+ %tmp4 = or <4 x i1> %tmp2, %tmp3
+ %tmp5 = select <4 x i1> %tmp4, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+ %tmp6 = extractelement <4 x i32> %tmp5, i32 0
+ ret i32 %tmp6
+}
diff --git a/test/CodeGen/X86/rdrand-x86_64.ll b/test/CodeGen/X86/rdrand-x86_64.ll
new file mode 100644
index 000000000000..06f1136087bb
--- /dev/null
+++ b/test/CodeGen/X86/rdrand-x86_64.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
+
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i32 @_rdrand64_step(i64* %random_val) {
+; CHECK-LABEL: _rdrand64_step:
+; CHECK: # BB#0:
+; CHECK-NEXT: rdrandq %rcx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovael %ecx, %eax
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %call = call {i64, i32} @llvm.x86.rdrand.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index 107cde05a0e6..0638e0095282 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -1,66 +1,117 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X64
+
declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
-declare {i64, i32} @llvm.x86.rdrand.64()
define i32 @_rdrand16_step(i16* %random_val) {
+; X86-LABEL: _rdrand16_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdrandw %ax
+; X86-NEXT: movzwl %ax, %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movw %dx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdrand16_step:
+; X64: # BB#0:
+; X64-NEXT: rdrandw %ax
+; X64-NEXT: movzwl %ax, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%call = call {i16, i32} @llvm.x86.rdrand.16()
%randval = extractvalue {i16, i32} %call, 0
store i16 %randval, i16* %random_val
%isvalid = extractvalue {i16, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdrand16_step:
-; CHECK: rdrandw %ax
-; CHECK: movzwl %ax, %ecx
-; CHECK: movl $1, %eax
-; CHECK: cmovael %ecx, %eax
-; CHECK: movw %cx, (%r[[A0:di|cx]])
-; CHECK: ret
}
define i32 @_rdrand32_step(i32* %random_val) {
+; X86-LABEL: _rdrand32_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdrandl %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdrand32_step:
+; X64: # BB#0:
+; X64-NEXT: rdrandl %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
%call = call {i32, i32} @llvm.x86.rdrand.32()
%randval = extractvalue {i32, i32} %call, 0
store i32 %randval, i32* %random_val
%isvalid = extractvalue {i32, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdrand32_step:
-; CHECK: rdrandl %e[[T0:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T0]], %eax
-; CHECK: movl %e[[T0]], (%r[[A0]])
-; CHECK: ret
-}
-
-define i32 @_rdrand64_step(i64* %random_val) {
- %call = call {i64, i32} @llvm.x86.rdrand.64()
- %randval = extractvalue {i64, i32} %call, 0
- store i64 %randval, i64* %random_val
- %isvalid = extractvalue {i64, i32} %call, 1
- ret i32 %isvalid
-; CHECK-LABEL: _rdrand64_step:
-; CHECK: rdrandq %r[[T1:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T1]], %eax
-; CHECK: movq %r[[T1]], (%r[[A0]])
-; CHECK: ret
}
; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
define i32 @CSE() nounwind {
+; X86-LABEL: CSE:
+; X86: # BB#0:
+; X86-NEXT: rdrandl %ecx
+; X86-NEXT: rdrandl %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: CSE:
+; X64: # BB#0:
+; X64-NEXT: rdrandl %ecx
+; X64-NEXT: rdrandl %eax
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: retq
%rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%v1 = extractvalue { i32, i32 } %rand1, 0
%rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
%v2 = extractvalue { i32, i32 } %rand2, 0
%add = add i32 %v2, %v1
ret i32 %add
-; CHECK-LABEL: CSE:
-; CHECK: rdrandl
-; CHECK: rdrandl
}
; Check that MachineLICM doesn't hoist rdrand instructions.
define void @loop(i32* %p, i32 %n) nounwind {
+; X86-LABEL: loop:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: je .LBB3_3
+; X86-NEXT: # BB#1: # %while.body.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB3_2: # %while.body
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: rdrandl %edx
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: leal 4(%ecx), %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB3_2
+; X86-NEXT: .LBB3_3: # %while.end
+; X86-NEXT: retl
+;
+; X64-LABEL: loop:
+; X64: # BB#0: # %entry
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: je .LBB3_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB3_1: # %while.body
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: rdrandl %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: leaq 4(%rdi), %rdi
+; X64-NEXT: decl %esi
+; X64-NEXT: jne .LBB3_1
+; X64-NEXT: .LBB3_2: # %while.end
+; X64-NEXT: retq
entry:
%tobool1 = icmp eq i32 %n, 0
br i1 %tobool1, label %while.end, label %while.body
@@ -78,8 +129,4 @@ while.body: ; preds = %entry, %while.body
while.end: ; preds = %while.body, %entry
ret void
-; CHECK-LABEL: loop:
-; CHECK-NOT: rdrandl
-; CHECK: This Inner Loop Header: Depth=1
-; CHECK: rdrandl
}
diff --git a/test/CodeGen/X86/rdseed-x86_64.ll b/test/CodeGen/X86/rdseed-x86_64.ll
new file mode 100644
index 000000000000..b0d9748dd6ae
--- /dev/null
+++ b/test/CodeGen/X86/rdseed-x86_64.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s
+
+declare {i64, i32} @llvm.x86.rdseed.64()
+
+define i32 @_rdseed64_step(i64* %random_val) {
+; CHECK-LABEL: _rdseed64_step:
+; CHECK: # BB#0:
+; CHECK-NEXT: rdseedq %rcx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovael %ecx, %eax
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %call = call {i64, i32} @llvm.x86.rdseed.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+}
diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll
index c219b4ad27ec..b22e3e7ceac0 100644
--- a/test/CodeGen/X86/rdseed.ll
+++ b/test/CodeGen/X86/rdseed.ll
@@ -1,48 +1,56 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X64
declare {i16, i32} @llvm.x86.rdseed.16()
declare {i32, i32} @llvm.x86.rdseed.32()
-declare {i64, i32} @llvm.x86.rdseed.64()
define i32 @_rdseed16_step(i16* %random_val) {
+; X86-LABEL: _rdseed16_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdseedw %ax
+; X86-NEXT: movzwl %ax, %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movw %dx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdseed16_step:
+; X64: # BB#0:
+; X64-NEXT: rdseedw %ax
+; X64-NEXT: movzwl %ax, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movw %cx, (%rdi)
+; X64-NEXT: retq
%call = call {i16, i32} @llvm.x86.rdseed.16()
%randval = extractvalue {i16, i32} %call, 0
store i16 %randval, i16* %random_val
%isvalid = extractvalue {i16, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdseed16_step:
-; CHECK: rdseedw %ax
-; CHECK: movzwl %ax, %ecx
-; CHECK: movl $1, %eax
-; CHECK: cmovael %ecx, %eax
-; CHECK: movw %cx, (%r[[A0:di|cx]])
-; CHECK: ret
}
define i32 @_rdseed32_step(i32* %random_val) {
+; X86-LABEL: _rdseed32_step:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: rdseedl %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: _rdseed32_step:
+; X64: # BB#0:
+; X64-NEXT: rdseedl %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovael %ecx, %eax
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: retq
%call = call {i32, i32} @llvm.x86.rdseed.32()
%randval = extractvalue {i32, i32} %call, 0
store i32 %randval, i32* %random_val
%isvalid = extractvalue {i32, i32} %call, 1
ret i32 %isvalid
-; CHECK-LABEL: _rdseed32_step:
-; CHECK: rdseedl %e[[T0:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T0]], %eax
-; CHECK: movl %e[[T0]], (%r[[A0]])
-; CHECK: ret
-}
-
-define i32 @_rdseed64_step(i64* %random_val) {
- %call = call {i64, i32} @llvm.x86.rdseed.64()
- %randval = extractvalue {i64, i32} %call, 0
- store i64 %randval, i64* %random_val
- %isvalid = extractvalue {i64, i32} %call, 1
- ret i32 %isvalid
-; CHECK-LABEL: _rdseed64_step:
-; CHECK: rdseedq %r[[T1:[a-z]+]]
-; CHECK: movl $1, %eax
-; CHECK: cmovael %e[[T1]], %eax
-; CHECK: movq %r[[T1]], (%r[[A0]])
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 16e261bf3c5e..02a968c6f27d 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -45,9 +45,9 @@ define float @f32_no_estimate(float %x) #0 {
;
; SANDY-LABEL: f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL: # BB#0:
@@ -113,11 +113,11 @@ define float @f32_one_step(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
; HASWELL: # BB#0:
@@ -207,7 +207,7 @@ define float @f32_two_step(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -215,7 +215,7 @@ define float @f32_two_step(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
; HASWELL: # BB#0:
@@ -284,25 +284,25 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
; HASWELL-NEXT: retq # sched: [1:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
; AVX512-NEXT: retq # sched: [1:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -350,18 +350,18 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: retq # sched: [1:1.00]
@@ -370,7 +370,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -379,7 +379,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: retq # sched: [1:1.00]
@@ -453,9 +453,9 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -463,12 +463,12 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -480,7 +480,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -493,7 +493,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -504,7 +504,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -541,30 +541,30 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:19.00]
+; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
; HASWELL-NEXT: retq # sched: [1:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
; AVX512-NEXT: retq # sched: [1:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -610,27 +610,27 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: retq # sched: [1:1.00]
@@ -639,7 +639,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -648,7 +648,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: retq # sched: [1:1.00]
@@ -722,22 +722,22 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -745,12 +745,12 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -762,7 +762,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -775,7 +775,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; KNL-LABEL: v8f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -786,7 +786,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index 440a6f0bef13..c82eab84757f 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -39,8 +39,8 @@ define float @f32_no_step_2(float %x) #3 {
; SANDY-LABEL: f32_no_step_2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # BB#0:
@@ -110,12 +110,12 @@ define float @f32_one_step_2(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # BB#0:
@@ -198,13 +198,13 @@ define float @f32_one_step_2_divs(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # BB#0:
@@ -305,7 +305,7 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -313,8 +313,8 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # BB#0:
@@ -403,19 +403,19 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -425,7 +425,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -435,7 +435,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -501,20 +501,20 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -525,7 +525,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -536,7 +536,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -619,9 +619,9 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -629,13 +629,13 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -648,7 +648,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -662,7 +662,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -674,7 +674,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
@@ -729,29 +729,29 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step2:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
@@ -761,7 +761,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -771,7 +771,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
@@ -835,31 +835,31 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step_2_divs:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
@@ -870,7 +870,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -881,7 +881,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; KNL-LABEL: v8f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
@@ -964,23 +964,23 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -988,13 +988,13 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1007,7 +1007,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1021,7 +1021,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; KNL-LABEL: v8f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1033,7 +1033,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
@@ -1064,13 +1064,13 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; BTVER2-LABEL: v8f32_no_step:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step:
; HASWELL: # BB#0:
@@ -1118,15 +1118,15 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; BTVER2-LABEL: v8f32_no_step2:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step2:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
index ba8ff1bc1819..3bb14c4b1cd8 100644
--- a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
; Test case for the recoloring of broken hints.
; This is tricky to keep reasonably small while still triggering this optimization, since
; it requires that splitting and spilling occur.
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
index 56a7d3285056..c7117be91ab4 100644
--- a/test/CodeGen/X86/rotate4.ll
+++ b/test/CodeGen/X86/rotate4.ll
@@ -1,17 +1,20 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; Check that we recognize this idiom for rotation too:
; a << (b & (OpSize-1)) | a >> ((0 - b) & (OpSize-1))
define i32 @rotate_left_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_left_32:
-; CHECK-NOT: and
-; CHECK: roll
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: roll %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%and = and i32 %b, 31
%shl = shl i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = lshr i32 %a, %and3
%or = or i32 %shl, %shr
ret i32 %or
@@ -19,13 +22,15 @@ entry:
define i32 @rotate_right_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_right_32:
-; CHECK-NOT: and
-; CHECK: rorl
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorl %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%and = and i32 %b, 31
%shl = lshr i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = shl i32 %a, %and3
%or = or i32 %shl, %shr
ret i32 %or
@@ -33,13 +38,15 @@ entry:
define i64 @rotate_left_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_left_64:
-; CHECK-NOT: and
-; CHECK: rolq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%and = and i64 %b, 63
%shl = shl i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = lshr i64 %a, %and3
%or = or i64 %shl, %shr
ret i64 %or
@@ -47,13 +54,15 @@ entry:
define i64 @rotate_right_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_right_64:
-; CHECK-NOT: and
-; CHECK: rorq
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%and = and i64 %b, 63
%shl = lshr i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = shl i64 %a, %and3
%or = or i64 %shl, %shr
ret i64 %or
@@ -63,16 +72,15 @@ entry:
define void @rotate_left_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_left_m32:
-; CHECK-NOT: and
-; CHECK: roll
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: roll %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i32, i32* %pa, align 16
%and = and i32 %b, 31
%shl = shl i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = lshr i32 %a, %and3
%or = or i32 %shl, %shr
store i32 %or, i32* %pa, align 32
@@ -81,16 +89,15 @@ entry:
define void @rotate_right_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_right_m32:
-; CHECK-NOT: and
-; CHECK: rorl
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorl %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i32, i32* %pa, align 16
%and = and i32 %b, 31
%shl = lshr i32 %a, %and
- %0 = sub i32 0, %b
- %and3 = and i32 %0, 31
+ %t0 = sub i32 0, %b
+ %and3 = and i32 %t0, 31
%shr = shl i32 %a, %and3
%or = or i32 %shl, %shr
store i32 %or, i32* %pa, align 32
@@ -99,16 +106,15 @@ entry:
define void @rotate_left_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_left_m64:
-; CHECK-NOT: and
-; CHECK: rolq
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolq %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i64, i64* %pa, align 16
%and = and i64 %b, 63
%shl = shl i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = lshr i64 %a, %and3
%or = or i64 %shl, %shr
store i64 %or, i64* %pa, align 64
@@ -117,18 +123,18 @@ entry:
define void @rotate_right_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_right_m64:
-; CHECK-NOT: and
-; CHECK: rorq
-; no store:
-; CHECK-NOT: mov
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorq %cl, (%rdi)
+; CHECK-NEXT: retq
%a = load i64, i64* %pa, align 16
%and = and i64 %b, 63
%shl = lshr i64 %a, %and
- %0 = sub i64 0, %b
- %and3 = and i64 %0, 63
+ %t0 = sub i64 0, %b
+ %and3 = and i64 %t0, 63
%shr = shl i64 %a, %and3
%or = or i64 %shl, %shr
store i64 %or, i64* %pa, align 64
ret void
}
+
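For reference, the masked-shift rotate idiom that rotate4.ll exercises looks roughly like this in C (a minimal illustrative sketch, not part of the patch; the function name is hypothetical):

    unsigned rotate_left_32(unsigned a, unsigned b) {
        /* Both shift amounts are masked to [0,31], so there is no undefined
           behaviour and the whole expression matches a single 32-bit rotate
           (roll) in the backend. */
        return (a << (b & 31)) | (a >> ((0 - b) & 31));
    }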
diff --git a/test/CodeGen/X86/sbb.ll b/test/CodeGen/X86/sbb.ll
index 414780b2d4e6..b6e8ebf6ed06 100644
--- a/test/CodeGen/X86/sbb.ll
+++ b/test/CodeGen/X86/sbb.ll
@@ -146,10 +146,8 @@ define i32 @ugt_select_neg1_or_0(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl %esi, %edi
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp uge i32 %x, %y
%ext = zext i1 %cmp to i32
@@ -163,10 +161,8 @@ define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ule_select_0_or_neg1:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: cmpl %edi, %esi
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp ule i32 %y, %x
%ext = zext i1 %cmp to i32
@@ -180,10 +176,8 @@ define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1_sub:
; CHECK: # BB#0:
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl %esi, %edi
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: decl %eax
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp uge i32 %x, %y
%ext = zext i1 %cmp to i32
@@ -191,6 +185,38 @@ define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
ret i32 %sub
}
+; Check more sub-from-zero patterns.
+; (X >u Y) ? -1 : 0 --> cmp, sbb
+
+define i64 @ugt_select_neg1_or_0_sub(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ugt_select_neg1_or_0_sub:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpq %rdi, %rsi
+; CHECK-NEXT: sbbq %rax, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i64 %x, %y
+ %zext = zext i1 %cmp to i64
+ %sub = sub i64 0, %zext
+ ret i64 %sub
+}
+
+; Swap the predicate and compare operands:
+; (Y <u X) ? -1 : 0 --> cmp, sbb
+
+define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind {
+; CHECK-LABEL: ult_select_neg1_or_0_sub:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpw %di, %si
+; CHECK-NEXT: sbbw %ax, %ax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i16 %y, %x
+ %zext = zext i1 %cmp to i16
+ %sub = sub i16 0, %zext
+ ret i16 %sub
+}
+
+
+
; Make sure we're creating nodes with the right value types. This would crash.
; https://bugs.llvm.org/show_bug.cgi?id=33560
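The new sbb.ll tests cover the sub-from-zero form of the unsigned-compare select; the source pattern corresponds roughly to this C sketch (illustrative only, identifiers are hypothetical):

    unsigned long ugt_select_neg1_or_0(unsigned long x, unsigned long y) {
        /* zext(x > y) is 0 or 1; subtracting it from zero yields 0 or ~0UL,
           which the x86 backend can lower to cmp followed by sbb. */
        return 0UL - (x > y);
    }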
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index a97e7c299e73..0eb9bf46ffd1 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -205,6 +205,111 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) {
ret i32 %sel
}
+; If the constants differ by a small multiplier, use LEA.
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> LEA C2(Cond * (C1-C2))
+
+define i32 @select_lea_2(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -1, i32 1
+ ret i32 %sel
+}
+
+define i64 @select_lea_3(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: movq $-2, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -2, i64 1
+ ret i64 %sel
+}
+
+define i32 @select_lea_5(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-2, %ecx
+; CHECK-NEXT: movl $3, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -2, i32 3
+ ret i32 %sel
+}
+
+define i64 @select_lea_9(i1 zeroext %cond) {
+; CHECK-LABEL: select_lea_9:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $2, %ecx
+; CHECK-NEXT: movq $-7, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -7, i64 2
+ ret i64 %sel
+}
+
+
+; If the constants differ by a large power-of-2, that can be a shift of the difference plus the smaller constant.
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2
+
+define i8 @select_pow2_diff(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movb $19, %al
+; CHECK-NEXT: jne .LBB22_2
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movb $3, %al
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i8 19, i8 3
+ ret i8 %sel
+}
+
+define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movw $7, %cx
+; CHECK-NEXT: movw $71, %ax
+; CHECK-NEXT: cmovnew %cx, %ax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i16 7, i16 71
+ ret i16 %sel
+}
+
+define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_neg:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $-9, %ecx
+; CHECK-NEXT: movl $-25, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i32 -9, i32 -25
+ ret i32 %sel
+}
+
+define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) {
+; CHECK-LABEL: select_pow2_diff_neg_invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: testb %dil, %dil
+; CHECK-NEXT: movl $29, %ecx
+; CHECK-NEXT: movq $-99, %rax
+; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK-NEXT: retq
+ %sel = select i1 %cond, i64 -99, i64 29
+ ret i64 %sel
+}
+
; In general, select of 2 constants could be:
; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
@@ -263,11 +368,11 @@ define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_add_constant_vec:
; CHECK: # BB#0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB22_1
+; CHECK-NEXT: jne .LBB30_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12,13,14,15]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB22_1:
+; CHECK-NEXT: .LBB30_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967293,14,4,4]
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 12, i32 1, i32 0>, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
@@ -279,11 +384,11 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_fmul_constant_vec:
; CHECK: # BB#0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB23_1
+; CHECK-NEXT: jne .LBB31_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.188300e+02,3.454000e+01]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB23_1:
+; CHECK-NEXT: .LBB31_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [-2.040000e+01,3.768000e+01]
; CHECK-NEXT: retq
%sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
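The select_lea_* comments above rest on the identity behind folding a select of two constants into arithmetic; a small C sketch of that identity (illustrative only, not taken from the patch):

    int select_of_constants(int cond01, int c1, int c2) {
        /* For cond01 in {0, 1}:  cond01 ? c1 : c2  ==  c2 + cond01 * (c1 - c2).
           When c1 - c2 is a small multiplier (2, 3, 5, 9) the multiply-add
           can map onto a single LEA. */
        return c2 + cond01 * (c1 - c2);
    }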
diff --git a/test/CodeGen/X86/shift-codegen.ll b/test/CodeGen/X86/shift-codegen.ll
index 7d52bdeb9e3a..295a55d86a00 100644
--- a/test/CodeGen/X86/shift-codegen.ll
+++ b/test/CodeGen/X86/shift-codegen.ll
@@ -1,38 +1,36 @@
-; RUN: llc < %s -relocation-model=static -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -relocation-model=static -mtriple=i686-unknown-unknown | FileCheck %s
; This should produce two shll instructions, not any lea's.
target triple = "i686-apple-darwin8"
-@Y = weak global i32 0 ; <i32*> [#uses=1]
-@X = weak global i32 0 ; <i32*> [#uses=2]
-
+@Y = weak global i32 0
+@X = weak global i32 0
define void @fn1() {
; CHECK-LABEL: fn1:
-; CHECK-NOT: ret
-; CHECK-NOT: lea
-; CHECK: shll $3
-; CHECK-NOT: lea
-; CHECK: ret
-
- %tmp = load i32, i32* @Y ; <i32> [#uses=1]
- %tmp1 = shl i32 %tmp, 3 ; <i32> [#uses=1]
- %tmp2 = load i32, i32* @X ; <i32> [#uses=1]
- %tmp3 = or i32 %tmp1, %tmp2 ; <i32> [#uses=1]
+; CHECK: # BB#0:
+; CHECK-NEXT: movl Y, %eax
+; CHECK-NEXT: shll $3, %eax
+; CHECK-NEXT: orl %eax, X
+; CHECK-NEXT: retl
+ %tmp = load i32, i32* @Y
+ %tmp1 = shl i32 %tmp, 3
+ %tmp2 = load i32, i32* @X
+ %tmp3 = or i32 %tmp1, %tmp2
store i32 %tmp3, i32* @X
ret void
}
define i32 @fn2(i32 %X, i32 %Y) {
; CHECK-LABEL: fn2:
-; CHECK-NOT: ret
-; CHECK-NOT: lea
-; CHECK: shll $3
-; CHECK-NOT: lea
-; CHECK: ret
-
- %tmp2 = shl i32 %Y, 3 ; <i32> [#uses=1]
- %tmp4 = or i32 %tmp2, %X ; <i32> [#uses=1]
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $3, %eax
+; CHECK-NEXT: orl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
+ %tmp2 = shl i32 %Y, 3
+ %tmp4 = or i32 %tmp2, %X
ret i32 %tmp4
}
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 698878708977..76cf4a41a6cb 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-coalescing | FileCheck %s
define i32* @test1(i32* %P, i32 %X) {
; CHECK-LABEL: test1:
-; CHECK-NOT: shrl
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = lshr i32 %X, 2
%gep.upgrd.1 = zext i32 %Y to i64
%P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.1
@@ -15,11 +16,11 @@ entry:
define i32* @test2(i32* %P, i32 %X) {
; CHECK-LABEL: test2:
-; CHECK: shll $4
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = shl i32 %X, 2
%gep.upgrd.2 = zext i32 %Y to i64
%P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.2
@@ -28,11 +29,11 @@ entry:
define i32* @test3(i32* %P, i32 %X) {
; CHECK-LABEL: test3:
-; CHECK-NOT: shrl
-; CHECK-NOT: shll
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
%Y = ashr i32 %X, 2
%P2 = getelementptr i32, i32* %P, i32 %Y
ret i32* %P2
@@ -40,25 +41,27 @@ entry:
define fastcc i32 @test4(i32* %d) {
; CHECK-LABEL: test4:
-; CHECK-NOT: shrl
-; CHECK: ret
-
-entry:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl 3(%ecx), %eax
+; CHECK-NEXT: retl
%tmp4 = load i32, i32* %d
%tmp512 = lshr i32 %tmp4, 24
ret i32 %tmp512
}
-define i64 @test5(i16 %i, i32* %arr) {
; Ensure that we don't fold away shifts which have multiple uses, as they are
; just re-introduced for the second use.
-; CHECK-LABEL: test5:
-; CHECK-NOT: shrl
-; CHECK: shrl $11
-; CHECK-NOT: shrl
-; CHECK: ret
-entry:
+define i64 @test5(i16 %i, i32* %arr) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $11, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: addl (%ecx,%eax,4), %eax
+; CHECK-NEXT: setb %dl
+; CHECK-NEXT: retl
%i.zext = zext i16 %i to i32
%index = lshr i32 %i.zext, 11
%index.zext = zext i32 %index to i64
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index ee8921c41a06..c84869433546 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -37,24 +37,16 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -62,11 +54,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -74,12 +62,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -166,11 +149,8 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX2-LABEL: shuffle_v16i16_to_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -178,11 +158,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -190,42 +166,22 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -293,48 +249,50 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX-LABEL: shuffle_v8i32_to_v4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps (%rdi), %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vmovaps %xmm0, (%rsi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i32_to_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_to_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
@@ -413,11 +371,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX2-LABEL: shuffle_v32i8_to_v8i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -425,11 +381,8 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -437,39 +390,23 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -542,26 +479,19 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -569,12 +499,8 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -582,31 +508,23 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -676,24 +594,19 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -701,11 +614,8 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -713,30 +623,23 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -802,3 +705,73 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
store <4 x i8> %strided.vec, <4 x i8>* %S
ret void
}
+
+; In this case not all elements are collected from the same source vector, so
+; the resulting BUILD_VECTOR should not be combined to a truncate.
+define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
+; AVX1-LABEL: negative:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: negative:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: negative:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: negative:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: negative:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: negative:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
+; AVX512BWVL-NEXT: kmovd %eax, %k1
+; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512BWVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %w0 = extractelement <32 x i8> %w, i32 0
+ %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
+ ret <16 x i8> %merged
+}
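The even-element shuffles in this file can fold to vpmov* truncations because, on a little-endian target, keeping the even bytes of a <32 x i8> load is the same as truncating sixteen i16 lanes to i8; a scalar C sketch of that equivalence (illustrative only, not part of the patch):

    #include <stdint.h>
    void trunc_v16i16_to_v16i8(const uint16_t *in, uint8_t *out) {
        /* Byte 2*i of the little-endian input is the low byte of lane i,
           i.e. the 16-bit value truncated to 8 bits. */
        for (int i = 0; i < 16; ++i)
            out[i] = (uint8_t)in[i];
    }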
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index a3ba58975800..69155b5cc565 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -11,49 +11,37 @@
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -106,54 +94,12 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
-; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i16> %strided.vec, <16 x i16>* %S
@@ -177,11 +123,8 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
@@ -205,127 +148,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
}
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm1
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -347,99 +175,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vmovd %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vmovd %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -461,95 +202,12 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi
-; AVX512BW-NEXT: vmovd %edi, %xmm0
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
store <8 x i8> %strided.vec, <8 x i8>* %S
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index 5436cf248bd5..d0b8972cee50 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -2,7 +2,7 @@
; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
; Test that by changing BlockFrequencyInfo we change the order in which
-; machine-sink looks for sucessor blocks. By not using BFI, both G and B
+; machine-sink looks for successor blocks. By not using BFI, both G and B
; have the same loop depth and no instruction is sunk - B is selected but
; can't be used, to avoid breaking a non-profitable critical edge. By using
; BFI, "mul" is sunk into the less frequent block G.
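The comment above explains the behaviour under test. As an editorial illustration (not part of the imported patch; the function and metadata names are invented), the kind of input where BFI-aware machine sinking pays off looks like the following: the multiply is only used in a successor that profile data marks as rarely taken, so sinking it there keeps the hot path short.

define i32 @sink_example(i32 %a, i32 %b, i1 %c) {
entry:
  %mul = mul i32 %a, %b                        ; only used in %cold
  br i1 %c, label %hot, label %cold, !prof !0  ; !0 marks %cold as infrequent
hot:                                           ; hot path never needs %mul
  ret i32 %a
cold:                                          ; %mul is only needed here
  ret i32 %mul
}
!0 = !{!"branch_weights", i32 1000, i32 1}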
diff --git a/test/CodeGen/X86/sink-gep-before-mem-inst.ll b/test/CodeGen/X86/sink-gep-before-mem-inst.ll
new file mode 100644
index 000000000000..b9c94adda993
--- /dev/null
+++ b/test/CodeGen/X86/sink-gep-before-mem-inst.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -S -codegenprepare -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i64 @test.after(i8 addrspace(1)* readonly align 8) {
+; CHECK-LABEL: test.after
+; CHECK: sunkaddr
+entry:
+ %.0 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+ %addr = bitcast i8 addrspace(1)* %.0 to i32 addrspace(1)*
+ br label %header
+
+header:
+ %addr.in.loop = phi i32 addrspace(1)* [ %addr, %entry ], [ %addr.after, %header ]
+ %local_2_ = phi i64 [ 0, %entry ], [ %.9, %header ]
+ %.7 = load i32, i32 addrspace(1)* %addr.in.loop, align 8
+ fence acquire
+ %.1 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8
+ %addr.after = bitcast i8 addrspace(1)* %.1 to i32 addrspace(1)*
+ %.8 = sext i32 %.7 to i64
+ %.9 = add i64 %local_2_, %.8
+ %not. = icmp sgt i64 %.9, 999
+ br i1 %not., label %exit, label %header
+
+exit:
+ ret i64 %.9
+}
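The CHECK line above keys on the "sunkaddr" name that CodeGenPrepare gives to address computations it rematerializes next to the memory access they feed. A smaller companion sketch under the same opt invocation (an editorial illustration, not part of the imported patch; the function name is invented and the rewrite is the behaviour being exercised, not a guarantee):

define i64 @sink_gep_sketch(i8 addrspace(1)* %p) {
entry:
  ; the address is computed here, in a different block than its use
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 16
  %addr = bitcast i8 addrspace(1)* %gep to i64 addrspace(1)*
  br label %use

use:
  ; CodeGenPrepare is expected to sink the GEP/bitcast down to this load,
  ; introducing an instruction named "sunkaddr" in the process
  %val = load i64, i64 addrspace(1)* %addr, align 8
  ret i64 %val
}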
diff --git a/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
new file mode 100644
index 000000000000..0461ee809efb
--- /dev/null
+++ b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
+;
+; D31946
+; Check that we don't end up with the "LLVM ERROR: Cannot select" error.
+; Additionally, ensure that the output code actually puts fp128 values in SSE registers.
+
+declare fp128 @llvm.fabs.f128(fp128)
+declare fp128 @llvm.copysign.f128(fp128, fp128)
+
+define fp128 @TestSelect(fp128 %a, fp128 %b) {
+ %cmp = fcmp ogt fp128 %a, %b
+ %sub = fsub fp128 %a, %b
+ %res = select i1 %cmp, fp128 %sub, fp128 0xL00000000000000000000000000000000
+ ret fp128 %res
+; CHECK-LABEL: TestSelect:
+; CHECK movaps 16(%rsp), %xmm1
+; CHECK-NEXT callq __subtf3
+; CHECK-NEXT testl %ebx, %ebx
+; CHECK-NEXT jg .LBB0_2
+; CHECK-NEXT # BB#1:
+; CHECK-NEXT movaps .LCPI0_0(%rip), %xmm0
+; CHECK-NEXT .LBB0_2:
+; CHECK-NEXT addq $32, %rsp
+; CHECK-NEXT popq %rbx
+; CHECK-NEXT retq
+}
+
+define fp128 @TestFabs(fp128 %a) {
+ %res = call fp128 @llvm.fabs.f128(fp128 %a)
+ ret fp128 %res
+; CHECK-LABEL: TestFabs:
+; CHECK andps .LCPI1_0(%rip), %xmm0
+; CHECK-NEXT retq
+}
+
+define fp128 @TestCopysign(fp128 %a, fp128 %b) {
+ %res = call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b)
+ ret fp128 %res
+; CHECK-LABEL: TestCopysign:
+; CHECK andps .LCPI2_1(%rip), %xmm0
+; CHECK-NEXT orps %xmm1, %xmm0
+; CHECK-NEXT retq
+}
+
+define fp128 @TestFneg(fp128 %a) {
+ %mul = fmul fp128 %a, %a
+ %res = fsub fp128 0xL00000000000000008000000000000000, %mul
+ ret fp128 %res
+; CHECK-LABEL: TestFneg:
+; CHECK movaps %xmm0, %xmm1
+; CHECK-NEXT callq __multf3
+; CHECK-NEXT xorps .LCPI3_0(%rip), %xmm0
+; CHECK-NEXT popq %rax
+; CHECK-NEXT retq
+}
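The header comment of the new file above states the intent: fp128 values must stay in SSE registers and legalization must not hit "Cannot select". As a hedged companion sketch (not part of the imported patch; the function name is invented), the same property can be probed with any fp128 operation that soft-float legalization lowers to a libcall, for example an fadd that becomes a call to __addtf3:

define fp128 @TestFadd(fp128 %a, fp128 %b) {
  %res = fadd fp128 %a, %b
  ret fp128 %res
; CHECK-LABEL: TestFadd:
; CHECK: __addtf3
}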
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index 52e6b61aedfe..c41acd43b3ab 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -31,8 +31,8 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
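The remaining hunks in this file, and the sse2-schedule.ll hunks further below, only refresh the sched: [N:M] comments emitted for the SANDY check prefix, where N is the modelled latency and M the reciprocal throughput of the instruction. A minimal sketch of how one such annotation is produced (assuming the same -print-schedule style RUN line these schedule tests use; the function name is invented, and the [3:1.00] value is taken from the vaddps line in the hunk above):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s
define <4 x float> @sched_demo(<4 x float> %a, <4 x float> %b) {
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
; CHECK: vaddps {{.*}} sched: [3:1.00]
}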
@@ -73,8 +73,8 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_addss:
; SANDY: # BB#0:
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addss:
; HASWELL: # BB#0:
@@ -122,9 +122,9 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_andps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
@@ -176,9 +176,9 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
@@ -228,9 +228,9 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
@@ -277,7 +277,7 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpss:
; HASWELL: # BB#0:
@@ -347,16 +347,16 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_comiss:
; SANDY: # BB#0:
; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comiss:
; HASWELL: # BB#0:
@@ -417,10 +417,10 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ss:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ss:
; HASWELL: # BB#0:
@@ -466,10 +466,10 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ssq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ssq:
; HASWELL: # BB#0:
@@ -515,10 +515,10 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2si:
; HASWELL: # BB#0:
@@ -567,10 +567,10 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2siq:
; HASWELL: # BB#0:
@@ -619,10 +619,10 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2si:
; HASWELL: # BB#0:
@@ -668,10 +668,10 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2siq:
; HASWELL: # BB#0:
@@ -714,9 +714,9 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
@@ -756,9 +756,9 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
;
; SANDY-LABEL: test_divss:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divss:
; HASWELL: # BB#0:
@@ -799,8 +799,8 @@ define void @test_ldmxcsr(i32 %a0) {
; SANDY-LABEL: test_ldmxcsr:
; SANDY: # BB#0:
; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ldmxcsr:
; HASWELL: # BB#0:
@@ -843,8 +843,8 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
@@ -886,8 +886,8 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxss:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxss:
; HASWELL: # BB#0:
@@ -929,8 +929,8 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minps:
; SANDY: # BB#0:
; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
@@ -972,8 +972,8 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minss:
; SANDY: # BB#0:
; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minss:
; HASWELL: # BB#0:
@@ -1017,10 +1017,10 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
@@ -1068,7 +1068,7 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; SANDY-LABEL: test_movhlps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhlps:
; HASWELL: # BB#0:
@@ -1111,10 +1111,10 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhps:
; HASWELL: # BB#0:
@@ -1164,7 +1164,7 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlhps:
; HASWELL: # BB#0:
@@ -1206,10 +1206,10 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlps:
; HASWELL: # BB#0:
@@ -1254,8 +1254,8 @@ define i32 @test_movmskps(<4 x float> %a0) {
;
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
@@ -1295,8 +1295,8 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
@@ -1335,10 +1335,10 @@ define void @test_movss_mem(float* %a0, float* %a1) {
;
; SANDY-LABEL: test_movss_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_mem:
; HASWELL: # BB#0:
@@ -1383,8 +1383,8 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
;
; SANDY-LABEL: test_movss_reg:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_reg:
; HASWELL: # BB#0:
@@ -1423,10 +1423,10 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
@@ -1469,8 +1469,8 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
@@ -1511,8 +1511,8 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_mulss:
; SANDY: # BB#0:
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulss:
; HASWELL: # BB#0:
@@ -1560,9 +1560,9 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
@@ -1609,8 +1609,8 @@ define void @test_prefetchnta(i8* %a0) {
;
; SANDY-LABEL: test_prefetchnta:
; SANDY: # BB#0:
-; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_prefetchnta:
; HASWELL: # BB#0:
@@ -1652,10 +1652,10 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_rcpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [7:3.00]
+; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
@@ -1708,10 +1708,10 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; SANDY-LABEL: test_rcpss:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpss:
; HASWELL: # BB#0:
@@ -1765,9 +1765,9 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
@@ -1819,11 +1819,11 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
;
; SANDY-LABEL: test_rsqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtss:
; HASWELL: # BB#0:
@@ -1875,7 +1875,7 @@ define void @test_sfence() {
; SANDY-LABEL: test_sfence:
; SANDY: # BB#0:
; SANDY-NEXT: sfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sfence:
; HASWELL: # BB#0:
@@ -1917,8 +1917,8 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
@@ -1962,10 +1962,10 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
@@ -2017,11 +2017,11 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtss:
; HASWELL: # BB#0:
@@ -2067,9 +2067,9 @@ define i32 @test_stmxcsr() {
;
; SANDY-LABEL: test_stmxcsr:
; SANDY: # BB#0:
-; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_stmxcsr:
; HASWELL: # BB#0:
@@ -2112,8 +2112,8 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
@@ -2154,8 +2154,8 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_subss:
; SANDY: # BB#0:
; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subss:
; HASWELL: # BB#0:
@@ -2220,16 +2220,16 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_ucomiss:
; SANDY: # BB#0:
; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomiss:
; HASWELL: # BB#0:
@@ -2292,8 +2292,8 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpckhps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
@@ -2338,8 +2338,8 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
@@ -2387,9 +2387,9 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 14c155c8c6c0..3c36b2138139 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -31,8 +31,8 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
@@ -73,8 +73,8 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_addsd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsd:
; HASWELL: # BB#0:
@@ -117,10 +117,10 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_andpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
@@ -170,10 +170,10 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
@@ -226,9 +226,9 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
@@ -275,7 +275,7 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpsd:
; HASWELL: # BB#0:
@@ -345,16 +345,16 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_comisd:
; SANDY: # BB#0:
; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comisd:
; HASWELL: # BB#0:
@@ -416,9 +416,9 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
@@ -467,10 +467,10 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
;
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
@@ -517,10 +517,10 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
@@ -568,10 +568,10 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
@@ -620,9 +620,9 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvtps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
@@ -670,10 +670,10 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_cvtps2pd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2pd:
; HASWELL: # BB#0:
@@ -724,7 +724,7 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2si:
; HASWELL: # BB#0:
@@ -773,10 +773,10 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvtsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2siq:
; HASWELL: # BB#0:
@@ -830,10 +830,10 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; SANDY-LABEL: test_cvtsd2ss:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2ss:
; HASWELL: # BB#0:
@@ -882,9 +882,9 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; SANDY-LABEL: test_cvtsi2sd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sd:
; HASWELL: # BB#0:
@@ -931,9 +931,9 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; SANDY-LABEL: test_cvtsi2sdq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sdq:
; HASWELL: # BB#0:
@@ -985,11 +985,11 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2sd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2sd:
; HASWELL: # BB#0:
@@ -1038,10 +1038,10 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvttpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttpd2dq:
; HASWELL: # BB#0:
@@ -1091,9 +1091,9 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvttps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttps2dq:
; HASWELL: # BB#0:
@@ -1139,10 +1139,10 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2si:
; HASWELL: # BB#0:
@@ -1188,10 +1188,10 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2siq:
; HASWELL: # BB#0:
@@ -1234,9 +1234,9 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
@@ -1276,9 +1276,9 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
;
; SANDY-LABEL: test_divsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divsd:
; HASWELL: # BB#0:
@@ -1322,7 +1322,7 @@ define void @test_lfence() {
; SANDY-LABEL: test_lfence:
; SANDY: # BB#0:
; SANDY-NEXT: lfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lfence:
; HASWELL: # BB#0:
@@ -1363,7 +1363,7 @@ define void @test_mfence() {
; SANDY-LABEL: test_mfence:
; SANDY: # BB#0:
; SANDY-NEXT: mfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mfence:
; HASWELL: # BB#0:
@@ -1402,7 +1402,7 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
; SANDY-LABEL: test_maskmovdqu:
; SANDY: # BB#0:
; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovdqu:
; HASWELL: # BB#0:
@@ -1440,8 +1440,8 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
@@ -1483,8 +1483,8 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxsd:
; HASWELL: # BB#0:
@@ -1526,8 +1526,8 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minpd:
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
@@ -1569,8 +1569,8 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minsd:
; SANDY: # BB#0:
; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minsd:
; HASWELL: # BB#0:
@@ -1614,10 +1614,10 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
@@ -1662,10 +1662,10 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqa:
; HASWELL: # BB#0:
@@ -1710,10 +1710,10 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqu:
; HASWELL: # BB#0:
@@ -1768,12 +1768,12 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; SANDY-LABEL: test_movd:
; SANDY: # BB#0:
; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd:
; HASWELL: # BB#0:
@@ -1838,13 +1838,13 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
;
; SANDY-LABEL: test_movd_64:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33]
-; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd_64:
; HASWELL: # BB#0:
@@ -1900,10 +1900,10 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhpd:
; HASWELL: # BB#0:
@@ -1951,10 +1951,10 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlpd:
; HASWELL: # BB#0:
@@ -1998,8 +1998,8 @@ define i32 @test_movmskpd(<2 x double> %a0) {
;
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
@@ -2039,8 +2039,8 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
@@ -2080,8 +2080,8 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
@@ -2123,10 +2123,10 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
;
; SANDY-LABEL: test_movq_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_mem:
; HASWELL: # BB#0:
@@ -2174,7 +2174,7 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_reg:
; HASWELL: # BB#0:
@@ -2216,10 +2216,10 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
;
; SANDY-LABEL: test_movsd_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_mem:
; HASWELL: # BB#0:
@@ -2266,7 +2266,7 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; SANDY-LABEL: test_movsd_reg:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_reg:
; HASWELL: # BB#0:
@@ -2305,10 +2305,10 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
@@ -2351,8 +2351,8 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
@@ -2393,8 +2393,8 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_mulsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulsd:
; HASWELL: # BB#0:
@@ -2437,10 +2437,10 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orpd:
; HASWELL: # BB#0:
@@ -2496,8 +2496,8 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packssdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packssdw:
; HASWELL: # BB#0:
@@ -2548,8 +2548,8 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packsswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packsswb:
; HASWELL: # BB#0:
@@ -2600,8 +2600,8 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packuswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packuswb:
; HASWELL: # BB#0:
@@ -2648,8 +2648,8 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddb:
; HASWELL: # BB#0:
@@ -2694,8 +2694,8 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_paddd:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddd:
; HASWELL: # BB#0:
@@ -2736,8 +2736,8 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_paddq:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
@@ -2781,9 +2781,9 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_paddsb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsb:
; HASWELL: # BB#0:
@@ -2828,9 +2828,9 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_paddsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsw:
; HASWELL: # BB#0:
@@ -2876,8 +2876,8 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusb:
; HASWELL: # BB#0:
@@ -2923,8 +2923,8 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_paddusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusw:
; HASWELL: # BB#0:
@@ -2969,9 +2969,9 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_paddw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddw:
; HASWELL: # BB#0:
@@ -3015,9 +3015,9 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pand:
; SANDY: # BB#0:
; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
@@ -3070,9 +3070,9 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pandn:
; SANDY: # BB#0:
; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
@@ -3122,8 +3122,8 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pavgb:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgb:
; HASWELL: # BB#0:
@@ -3169,8 +3169,8 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pavgw:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgw:
; HASWELL: # BB#0:
@@ -3217,9 +3217,9 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpeqb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqb:
; HASWELL: # BB#0:
@@ -3269,9 +3269,9 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpeqd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqd:
; HASWELL: # BB#0:
@@ -3321,9 +3321,9 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpeqw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqw:
; HASWELL: # BB#0:
@@ -3374,9 +3374,9 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpgtb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtb:
; HASWELL: # BB#0:
@@ -3427,9 +3427,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpgtd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtd:
; HASWELL: # BB#0:
@@ -3480,9 +3480,9 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpgtw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtw:
; HASWELL: # BB#0:
@@ -3526,9 +3526,9 @@ define i16 @test_pextrw(<8 x i16> %a0) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
@@ -3570,9 +3570,9 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
;
; SANDY-LABEL: test_pinsrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrw:
; HASWELL: # BB#0:
@@ -3620,9 +3620,9 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmaddwd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddwd:
; HASWELL: # BB#0:
@@ -3669,8 +3669,8 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsw:
; HASWELL: # BB#0:
@@ -3716,8 +3716,8 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxub:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxub:
; HASWELL: # BB#0:
@@ -3763,8 +3763,8 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsw:
; HASWELL: # BB#0:
@@ -3810,8 +3810,8 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminub:
; SANDY: # BB#0:
; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminub:
; HASWELL: # BB#0:
@@ -3851,8 +3851,8 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
;
; SANDY-LABEL: test_pmovmskb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovmskb:
; HASWELL: # BB#0:
@@ -3891,7 +3891,7 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhuw:
; HASWELL: # BB#0:
@@ -3932,9 +3932,9 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhw:
; HASWELL: # BB#0:
@@ -3975,9 +3975,9 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmullw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
@@ -4027,7 +4027,7 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuludq:
; HASWELL: # BB#0:
@@ -4073,9 +4073,9 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_por:
; SANDY: # BB#0:
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
@@ -4126,9 +4126,9 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_psadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psadbw:
; HASWELL: # BB#0:
@@ -4176,9 +4176,9 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pshufd:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
-; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50]
+; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufd:
; HASWELL: # BB#0:
@@ -4226,10 +4226,10 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
;
; SANDY-LABEL: test_pshufhw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufhw:
; HASWELL: # BB#0:
@@ -4278,9 +4278,9 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pshuflw:
; SANDY: # BB#0:
; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshuflw:
; HASWELL: # BB#0:
@@ -4326,10 +4326,10 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pslld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslld:
; HASWELL: # BB#0:
@@ -4378,7 +4378,7 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; SANDY-LABEL: test_pslldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslldq:
; HASWELL: # BB#0:
@@ -4417,10 +4417,10 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_psllq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllq:
; HASWELL: # BB#0:
@@ -4468,10 +4468,10 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psllw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllw:
; HASWELL: # BB#0:
@@ -4519,10 +4519,10 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrad:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrad:
; HASWELL: # BB#0:
@@ -4570,10 +4570,10 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psraw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psraw:
; HASWELL: # BB#0:
@@ -4621,10 +4621,10 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrld:
; HASWELL: # BB#0:
@@ -4673,7 +4673,7 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; SANDY-LABEL: test_psrldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrldq:
; HASWELL: # BB#0:
@@ -4712,10 +4712,10 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_psrlq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlq:
; HASWELL: # BB#0:
@@ -4763,10 +4763,10 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psrlw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlw:
; HASWELL: # BB#0:
@@ -4816,8 +4816,8 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
@@ -4862,8 +4862,8 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psubd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
@@ -4904,8 +4904,8 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_psubq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
@@ -4950,8 +4950,8 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsb:
; HASWELL: # BB#0:
@@ -4997,8 +4997,8 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsw:
; HASWELL: # BB#0:
@@ -5044,8 +5044,8 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusb:
; HASWELL: # BB#0:
@@ -5091,8 +5091,8 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusw:
; HASWELL: # BB#0:
@@ -5138,8 +5138,8 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
@@ -5184,8 +5184,8 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_punpckhbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhbw:
; HASWELL: # BB#0:
@@ -5231,9 +5231,9 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckhdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhdq:
; HASWELL: # BB#0:
@@ -5279,10 +5279,10 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
;
; SANDY-LABEL: test_punpckhqdq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhqdq:
; HASWELL: # BB#0:
@@ -5330,8 +5330,8 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpckhwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhwd:
; HASWELL: # BB#0:
@@ -5375,9 +5375,9 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_punpcklbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklbw:
; HASWELL: # BB#0:
@@ -5423,9 +5423,9 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckldq:
; HASWELL: # BB#0:
@@ -5472,9 +5472,9 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; SANDY-LABEL: test_punpcklqdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50]
+; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklqdq:
; HASWELL: # BB#0:
@@ -5522,8 +5522,8 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpcklwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklwd:
; HASWELL: # BB#0:
@@ -5567,9 +5567,9 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pxor:
; SANDY: # BB#0:
; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
@@ -5616,9 +5616,9 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
@@ -5665,10 +5665,10 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [28:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
@@ -5720,11 +5720,11 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtsd:
; HASWELL: # BB#0:
@@ -5771,8 +5771,8 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
@@ -5813,8 +5813,8 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_subsd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subsd:
; HASWELL: # BB#0:
@@ -5879,16 +5879,16 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; SANDY-LABEL: test_ucomisd:
; SANDY: # BB#0:
; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomisd:
; HASWELL: # BB#0:
@@ -5950,9 +5950,9 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpckhpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
@@ -6005,9 +6005,9 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
@@ -6053,10 +6053,10 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index 482b2fcab642..ef1ddae4532d 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -31,8 +31,8 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
@@ -74,8 +74,8 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
@@ -116,9 +116,9 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_haddpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
@@ -159,9 +159,9 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
@@ -202,9 +202,9 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
@@ -245,9 +245,9 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
@@ -287,8 +287,8 @@ define <16 x i8> @test_lddqu(i8* %a0) {
;
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
@@ -330,9 +330,9 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
@@ -380,9 +380,9 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
@@ -430,9 +430,9 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 340b9abe8879..1ab1598fcab7 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -25,10 +25,10 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
;
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
@@ -65,9 +65,9 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
;
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
@@ -107,9 +107,9 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
@@ -150,9 +150,9 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
@@ -187,9 +187,9 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_dppd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dppd:
; HASWELL: # BB#0:
@@ -224,9 +224,9 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
@@ -262,8 +262,8 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
; SANDY-LABEL: test_insertps:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertps:
; HASWELL: # BB#0:
@@ -296,8 +296,8 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
;
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
@@ -328,9 +328,9 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_mpsadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mpsadbw:
; HASWELL: # BB#0:
@@ -367,8 +367,8 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packusdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packusdw:
; HASWELL: # BB#0:
@@ -411,8 +411,8 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; SANDY-LABEL: test_pblendvb:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendvb:
; HASWELL: # BB#0:
@@ -448,8 +448,8 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pblendw:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendw:
; HASWELL: # BB#0:
@@ -483,9 +483,9 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_pcmpeqq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqq:
; HASWELL: # BB#0:
@@ -521,9 +521,9 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
;
; SANDY-LABEL: test_pextrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrb:
; HASWELL: # BB#0:
@@ -558,9 +558,9 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
;
; SANDY-LABEL: test_pextrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrd:
; HASWELL: # BB#0:
@@ -594,9 +594,9 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
;
; SANDY-LABEL: test_pextrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrq:
; HASWELL: # BB#0:
@@ -630,9 +630,9 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
@@ -667,9 +667,9 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
;
; SANDY-LABEL: test_phminposuw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phminposuw:
; HASWELL: # BB#0:
@@ -704,9 +704,9 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
;
; SANDY-LABEL: test_pinsrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrb:
; HASWELL: # BB#0:
@@ -740,9 +740,9 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
;
; SANDY-LABEL: test_pinsrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrd:
; HASWELL: # BB#0:
@@ -778,10 +778,10 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
;
; SANDY-LABEL: test_pinsrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrq:
; HASWELL: # BB#0:
@@ -819,8 +819,8 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsb:
; HASWELL: # BB#0:
@@ -856,8 +856,8 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsd:
; HASWELL: # BB#0:
@@ -893,8 +893,8 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxud:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxud:
; HASWELL: # BB#0:
@@ -930,8 +930,8 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxuw:
; HASWELL: # BB#0:
@@ -967,8 +967,8 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsb:
; HASWELL: # BB#0:
@@ -1004,8 +1004,8 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsd:
; HASWELL: # BB#0:
@@ -1041,8 +1041,8 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminud:
; SANDY: # BB#0:
; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminud:
; HASWELL: # BB#0:
@@ -1078,8 +1078,8 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminuw:
; HASWELL: # BB#0:
@@ -1118,9 +1118,9 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbw:
; HASWELL: # BB#0:
@@ -1162,9 +1162,9 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbd:
; HASWELL: # BB#0:
@@ -1206,9 +1206,9 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbq:
; HASWELL: # BB#0:
@@ -1250,9 +1250,9 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovsxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxdq:
; HASWELL: # BB#0:
@@ -1294,9 +1294,9 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwd:
; HASWELL: # BB#0:
@@ -1338,9 +1338,9 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwq:
; HASWELL: # BB#0:
@@ -1382,9 +1382,9 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbw:
; HASWELL: # BB#0:
@@ -1426,9 +1426,9 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbd:
; HASWELL: # BB#0:
@@ -1470,9 +1470,9 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbq:
; HASWELL: # BB#0:
@@ -1514,9 +1514,9 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovzxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxdq:
; HASWELL: # BB#0:
@@ -1558,9 +1558,9 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwd:
; HASWELL: # BB#0:
@@ -1602,9 +1602,9 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwq:
; HASWELL: # BB#0:
@@ -1642,9 +1642,9 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmuldq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuldq:
; HASWELL: # BB#0:
@@ -1680,9 +1680,9 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmulld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulld:
; HASWELL: # BB#0:
@@ -1724,13 +1724,13 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_ptest:
; SANDY: # BB#0:
-; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: setb %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ptest:
; HASWELL: # BB#0:
@@ -1778,9 +1778,9 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_roundpd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
@@ -1822,9 +1822,9 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_roundps:
; SANDY: # BB#0:
; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
@@ -1867,9 +1867,9 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; SANDY-LABEL: test_roundsd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundsd:
; HASWELL: # BB#0:
@@ -1912,9 +1912,9 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; SANDY-LABEL: test_roundss:
; SANDY: # BB#0:
; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundss:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index afc48bc57ee7..7ce9ffdbd0ea 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -26,9 +26,9 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; SANDY-LABEL: crc32_32_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_8:
; HASWELL: # BB#0:
@@ -68,9 +68,9 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; SANDY-LABEL: crc32_32_16:
; SANDY: # BB#0:
; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_16:
; HASWELL: # BB#0:
@@ -112,7 +112,7 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_32:
; HASWELL: # BB#0:
@@ -152,9 +152,9 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; SANDY-LABEL: crc32_64_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_8:
; HASWELL: # BB#0:
@@ -196,7 +196,7 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_64:
; HASWELL: # BB#0:
@@ -256,7 +256,7 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestri:
; HASWELL: # BB#0:
@@ -320,7 +320,7 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestrm:
; HASWELL: # BB#0:
@@ -369,12 +369,12 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistri:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistri:
; HASWELL: # BB#0:
@@ -416,9 +416,9 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistrm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistrm:
; HASWELL: # BB#0:
@@ -453,9 +453,9 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_pcmpgtq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtq:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
new file mode 100644
index 000000000000..11afdb7989f1
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
+
+define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
+; GENERIC-LABEL: test_extrq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>)
+
+define <2 x i64> @test_extrqi(<2 x i64> %a0) {
+; GENERIC-LABEL: test_extrqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq $2, $3, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq $2, $3, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8)
+
+define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) {
+; GENERIC-LABEL: test_movntsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntsd %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define void @test_movntss(i8* %p, <4 x float> %a) {
+; GENERIC-LABEL: test_movntss:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntss %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntss:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index 8b7a0c0ec02b..f24969a30c33 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -35,9 +35,9 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; SANDY-LABEL: test_pabsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsb:
; HASWELL: # BB#0:
@@ -86,9 +86,9 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pabsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
@@ -136,7 +136,7 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pabsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
@@ -182,8 +182,8 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_palignr:
; SANDY: # BB#0:
; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
-; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_palignr:
; HASWELL: # BB#0:
@@ -223,9 +223,9 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phaddd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddd:
; HASWELL: # BB#0:
@@ -274,9 +274,9 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddsw:
; HASWELL: # BB#0:
@@ -317,9 +317,9 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddw:
; HASWELL: # BB#0:
@@ -360,9 +360,9 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phsubd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubd:
; HASWELL: # BB#0:
@@ -411,9 +411,9 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubsw:
; HASWELL: # BB#0:
@@ -454,9 +454,9 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubw:
; HASWELL: # BB#0:
@@ -497,9 +497,9 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pmaddubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddubsw:
; HASWELL: # BB#0:
@@ -538,8 +538,8 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhrsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhrsw:
; HASWELL: # BB#0:
@@ -579,8 +579,8 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pshufb:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufb:
; HASWELL: # BB#0:
@@ -630,8 +630,8 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psignb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignb:
; HASWELL: # BB#0:
@@ -681,8 +681,8 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psignd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignd:
; HASWELL: # BB#0:
@@ -732,8 +732,8 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psignw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignw:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/swizzle-avx2.ll b/test/CodeGen/X86/swizzle-avx2.ll
index 29dfa6c2dcc1..6ca9126eb09d 100644
--- a/test/CodeGen/X86/swizzle-avx2.ll
+++ b/test/CodeGen/X86/swizzle-avx2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s
; Test that we correctly fold a shuffle that performs a swizzle of another
; shuffle node according to the rule
@@ -11,81 +12,77 @@
; Check that we produce a single vector permute / shuffle in all cases.
define <8 x i32> @swizzle_1(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_1
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_2(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_2
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
define <8 x i32> @swizzle_3(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_3
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
define <8 x i32> @swizzle_4(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_4
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_5(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_5
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_6(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_6
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
define <8 x i32> @swizzle_7(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %2
}
-; CHECK-LABEL: swizzle_7
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 80d36d5af4d2..5ce6bbd4b49e 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -1,253 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
-define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i32 %a, 4
- %1 = and i32 %0, 4095
- ret i32 %1
-}
-
-define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i32, i32* %a
- %1 = lshr i32 %0, 4
- %2 = and i32 %1, 4095
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i64 %a, 4
- %1 = and i64 %0, 4095
- ret i64 %1
-}
-
-define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i64, i64* %a
- %1 = lshr i64 %0, 4
- %2 = and i64 %1, 4095
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i32 %a, 1
- %1 = and i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i64 %a, 1
- %1 = and i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i32 1, %a
- %1 = xor i32 %0, -1
- %2 = or i32 %1, %a
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i64 1, %a
- %1 = xor i64 %0, -1
- %2 = or i64 %1, %a
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i32 -2, %a
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i64 -2, %a
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = and i32 %1, %0
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = and i64 %1, %0
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i32 %a, 1
- %1 = xor i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i64 %a, 1
- %1 = xor i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i32 %a, 1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i64 %a, 1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i32 %a, -1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i64 %a, -1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = and i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = and i64 %0, %1
- ret i64 %2
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 4
+ %t1 = and i32 %t0, 4095
+ ret i32 %t1
+}
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i32, i32* %a
+ %t1 = lshr i32 %t0, 4
+ %t2 = and i32 %t1, 4095
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 4
+ %t1 = and i64 %t0, 4095
+ ret i64 %t1
+}
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i64, i64* %a
+ %t1 = lshr i64 %t0, 4
+ %t2 = and i64 %t1, 4095
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = and i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = and i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 1, %a
+ %t1 = xor i32 %t0, -1
+ %t2 = or i32 %t1, %a
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 1, %a
+ %t1 = xor i64 %t0, -1
+ %t2 = or i64 %t1, %a
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = sub i32 -2, %a
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = sub i64 -2, %a
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = and i32 %t1, %t0
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = and i64 %t1, %t0
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = xor i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = xor i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, -1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, -1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind {
+; CHECK-LABEL: Ttest_x86_tbm_t1mskc_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = and i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = and i64 %t0, %t1
+ ret i64 %t2
}
+
diff --git a/test/CodeGen/X86/vec-copysign.ll b/test/CodeGen/X86/vec-copysign.ll
index d363dbdaef81..1ebd7ceafced 100644
--- a/test/CodeGen/X86/vec-copysign.ll
+++ b/test/CodeGen/X86/vec-copysign.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=CHECK
-; Assertions have been enhanced from utils/update_test_checks.py to show the constant pool values.
+; Assertions have been enhanced from utils/update_llc_test_checks.py to show the constant pool values.
; Use a macosx triple to make sure the format of those constant strings is exact.
; CHECK: [[SIGNMASK1:L.+]]:
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index f7fcd032cab3..556e32d0c87b 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -1,16 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s
; Without any typed operations, always use the smaller xorps.
-; CHECK: test
-; CHECK: xorps
define <2 x double> @test() {
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retl
ret <2 x double> zeroinitializer
}
; Prefer a constant pool load here.
-; CHECK: test2
-; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}{{CPI|__xmm@}}
define <4 x i32> @test2() nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,0]
+; CHECK-NEXT: retl
ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
}
+
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index b4a58deff2f8..731760a4ea55 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -153,14 +153,16 @@ define <32 x i16> @test7(<32 x i16> %a) {
;
; AVX2-LABEL: test7:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test7:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT: retq
@@ -183,7 +185,8 @@ define <16 x i32> @test8(<16 x i32> %a) {
;
; AVX2-LABEL: test8:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 1c352782fca4..745316effc98 100644
--- a/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s
; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
; Subtracting zero is free.
define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; CHECK-LABEL: vec_fsub_zero:
-; CHECK-NOT: subps
-; CHECK-NOT: xorps
-; CHECK: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%sub = fsub <4 x float> %x, zeroinitializer
ret <4 x float> %sub
}
@@ -15,9 +15,10 @@ define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; Negating doesn't require subtraction.
define <4 x float> @vec_fneg(<4 x float> %x) {
; CHECK-LABEL: vec_fneg:
-; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0
-; CHECK-NOT: subps
-; CHECK-NEXT: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
%sub = fsub <4 x float> zeroinitializer, %x
ret <4 x float> %sub
}
+
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index adda108bdc77..d2f33785530b 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -344,20 +344,43 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i16:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
ret <8 x i16> %out
}
@@ -431,17 +454,37 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i8:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index accbad35e9d7..4c5de2fed385 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -155,17 +155,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
;
; AVX512VPOPCNTDQ-LABEL: testv16i16:
; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
ret <16 x i16> %out
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index aa50206e7a5e..a6f4e3342897 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,11 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-LABEL: testv8i64:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -28,7 +29,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -42,7 +43,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
@@ -51,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-LABEL: testv16i32:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -82,7 +83,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -100,7 +101,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
@@ -109,7 +110,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -133,7 +134,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -147,36 +148,37 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
-; AVX512VPOPCNTDQ-LABEL: testv32i16:
-; AVX512VPOPCNTDQ: ## BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: retq
+; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
+; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: retq
+;
+; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
+; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-LABEL: testv64i8:
-; AVX512F: ## BB#0:
+; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -194,7 +196,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -205,23 +207,35 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
-; AVX512VPOPCNTDQ-LABEL: testv64i8:
-; AVX512VPOPCNTDQ: ## BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: retq
+; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
+; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: retq
+;
+; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
+; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-BW-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
new file mode 100644
index 000000000000..af69a5ac2283
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; Combine tests involving SSE4A target shuffles (EXTRQI,INSERTQI)
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
+; ALL-LABEL: combine_extrqi_pshufb_16i8:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
+; ALL-LABEL: combine_extrqi_pshufb_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %4 = bitcast <16 x i8> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSSE3-LABEL: combine_insertqi_pshufb_16i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE42-LABEL: combine_insertqi_pshufb_16i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: combine_insertqi_pshufb_16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSSE3-LABEL: combine_insertqi_pshufb_8i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE42-LABEL: combine_insertqi_pshufb_8i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: combine_insertqi_pshufb_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %4 = bitcast <16 x i8> %3 to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; ALL-LABEL: combine_pshufb_insertqi_pshufb:
+; ALL: # BB#0:
+; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
+; ALL-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %3
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 546b73126039..02314857c6d7 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -445,6 +445,21 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
ret <16 x i8> %res1
}
+define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
+; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
+; AVX-NEXT: retq
+ %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+ ret <16 x i8> %res1
+}
+
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll
index 138c421215f4..e458bb6fa52f 100644
--- a/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -1,4 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AMD10H
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2
@@ -10,7 +11,6 @@
define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) {
; ALL-LABEL: extrqi_len0_idx0:
; ALL: # BB#0:
-; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 0, i8 0)
ret <2 x i64> %1
@@ -36,6 +36,11 @@ define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) {
}
define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_0zzzuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -50,12 +55,17 @@ define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
}
define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_0zzzzzzz1zzzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
@@ -67,12 +77,17 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
}
define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_2zzzzzzz3zzzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz:
@@ -85,6 +100,11 @@ define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
}
define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_01zzuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -99,12 +119,17 @@ define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
}
define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
+; AMD10H-LABEL: shuf_01zzzzzz23zzzzzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
@@ -143,21 +168,37 @@ define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
}
define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
-; ALL-LABEL: shuf_012zuuuu:
-; ALL: # BB#0:
-; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; AMD10H-LABEL: shuf_012zuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: retq
+;
+; BTVER1-LABEL: shuf_012zuuuu:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: shuf_012zuuuu:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; BTVER2-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %s
}
define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
+; AMD10H-LABEL: shuf_0zzz1zzz:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movdqa %xmm0, %xmm1
+; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0zzz1zzz:
; BTVER1: # BB#0:
-; BTVER1-NEXT: movdqa %xmm0, %xmm1
-; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzz1zzz:
@@ -169,6 +210,12 @@ define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
}
define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
+; AMD10H-LABEL: shuf_0z1z:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: pxor %xmm1, %xmm1
+; AMD10H-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuf_0z1z:
; BTVER1: # BB#0:
; BTVER1-NEXT: pxor %xmm1, %xmm1
@@ -189,10 +236,20 @@ define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
; A length of zero is equivalent to a bit length of 64.
define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: insertqi_len0_idx0:
-; ALL: # BB#0:
-; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7],xmm0[u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; AMD10H-LABEL: insertqi_len0_idx0:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movaps %xmm1, %xmm0
+; AMD10H-NEXT: retq
+;
+; BTVER1-LABEL: insertqi_len0_idx0:
+; BTVER1: # BB#0:
+; BTVER1-NEXT: movaps %xmm1, %xmm0
+; BTVER1-NEXT: retq
+;
+; BTVER2-LABEL: insertqi_len0_idx0:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vmovaps %xmm1, %xmm0
+; BTVER2-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0)
ret <2 x i64> %1
}
@@ -303,6 +360,15 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
; Out of range.
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
+; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AMD10H-NEXT: packuswb %xmm0, %xmm0
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; BTVER1: # BB#0:
; BTVER1-NEXT: psrld $16, %xmm1
@@ -321,6 +387,13 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
}
define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
+; AMD10H-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; BTVER1: # BB#0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,5,5,4,4,5,5,4,4,5,5,6,6,7,7]
@@ -335,6 +408,12 @@ define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8
}
define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
+; AMD10H-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AMD10H: # BB#0:
+; AMD10H-NEXT: psrlq $16, %xmm0
+; AMD10H-NEXT: pand {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: retq
+;
; BTVER1-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; BTVER1: # BB#0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u],zero,xmm0[4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
diff --git a/test/CodeGen/X86/vector-truncate-combine.ll b/test/CodeGen/X86/vector-truncate-combine.ll
index 1a6dac8fa6e4..61808b802517 100644
--- a/test/CodeGen/X86/vector-truncate-combine.ll
+++ b/test/CodeGen/X86/vector-truncate-combine.ll
@@ -11,14 +11,14 @@
; preservation of the extend/truncate operations mentioned above (2 extend and
; 3 truncate instructions).
;
-; NOTE: This operation could be collapsed in to a single truncate. Once that is done
-; this test will have to be adjusted.
+; NOTE: This operation is collapsed to a single truncate, so this test no longer covers
+; what it originally intended to.
-; CHECK: PUNPCKLBWrr
-; CHECK: PUNPCKLWDrr
-; CHECK: PACKUSWBrr
+; CHECK: MOVLHPSrr
+; CHECK: PSHUFHWri
; CHECK: PACKUSWBrr
; CHECK: PACKUSWBrr
+; CHECK: MOVPDI2DIrr
define void @test(double %vec.coerce) local_unnamed_addr {
entry:
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 4b5a00a30d09..820178d2d992 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -928,17 +928,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv8i16:
@@ -1095,17 +1088,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv8i16u:
@@ -1243,14 +1229,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv16i8:
@@ -1384,14 +1366,10 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-SSE-LABEL: testv16i8u:
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 16192ec61a55..30e5661d5485 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -584,17 +584,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16:
@@ -722,17 +714,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; X32-AVX-LABEL: testv16i16u:
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 760216d561c4..3bf677aadf19 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -34,7 +34,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -52,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -70,7 +70,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -84,7 +84,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -94,7 +94,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -104,7 +104,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -122,7 +122,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -136,7 +136,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -172,7 +172,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -194,7 +194,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -216,7 +216,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -230,7 +230,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -240,7 +240,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -250,7 +250,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
@@ -272,7 +272,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
@@ -286,7 +286,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -318,7 +318,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -338,7 +338,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -358,35 +358,21 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
@@ -394,7 +380,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -426,7 +412,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -446,7 +432,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -466,35 +452,21 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
@@ -502,7 +474,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -528,7 +500,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -545,7 +517,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -562,7 +534,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -592,7 +564,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
@@ -618,7 +590,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # BB#0:
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -635,7 +607,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -652,7 +624,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8u:
-; AVX512VPOPCNTDQ: ## BB#0:
+; AVX512VPOPCNTDQ: # BB#0:
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index b5c7f86567a1..182d7cc73c9a 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -101,8 +101,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jge .LBB4_2
; CHECK-NEXT: # BB#1: # %bb1
; CHECK-NEXT: movl $1, %eax
diff --git a/test/CodeGen/X86/x32-lea-1.ll b/test/CodeGen/X86/x32-lea-1.ll
index 2f7d71e2baf1..afe3581a85bc 100644
--- a/test/CodeGen/X86/x32-lea-1.ll
+++ b/test/CodeGen/X86/x32-lea-1.ll
@@ -1,10 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O0 | FileCheck %s
-; CHECK: leal {{[-0-9]*}}(%r{{s|b}}p),
-; CHECK-NOT: leal {{[-0-9]*}}(%e{{s|b}}p),
define void @foo(i32** %p) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: leal -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: addl $16, %eax
+; CHECK-NEXT: movl %eax, (%edi)
+; CHECK-NEXT: retq
%a = alloca i32, i32 10
%addr = getelementptr i32, i32* %a, i32 4
store i32* %addr, i32** %p
ret void
}
+
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 1263605a6dc0..5f85975fdb5c 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1,9 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX3
define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_4:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
+; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_4:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -32,6 +49,21 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_2:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
+; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_2:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -54,6 +86,16 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
+; AVX1-LABEL: load_factorf64_1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX-LABEL: load_factorf64_1:
; AVX: # BB#0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@@ -98,24 +140,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: load_factori64_4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load_factori64_4:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX-NEXT: vmovdqu 64(%rdi), %ymm2
+; AVX-NEXT: vmovdqu 96(%rdi), %ymm3
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
%strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -128,6 +170,23 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
}
define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
+; AVX1-LABEL: store_factorf64_4:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: vmovupd %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovupd %ymm3, 64(%rdi)
+; AVX1-NEXT: vmovupd %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovupd %ymm2, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX-LABEL: store_factorf64_4:
; AVX: # BB#0:
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
@@ -169,22 +228,22 @@ define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: store_factori64_4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
-; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi)
-; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm2, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: store_factori64_4:
+; AVX: # BB#0:
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; AVX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX-NEXT: vmovdqu %ymm3, 64(%rdi)
+; AVX-NEXT: vmovdqu %ymm4, 32(%rdi)
+; AVX-NEXT: vmovdqu %ymm2, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -252,54 +311,54 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
-; AVX2-NEXT: vmovdqa %ymm4, 64(%rdi)
-; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm8, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: interleaved_store_vf32_i8_stride4:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
+; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
+; AVX-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX-NEXT: vmovdqa %ymm0, 96(%rdi)
+; AVX-NEXT: vmovdqa %ymm4, 64(%rdi)
+; AVX-NEXT: vmovdqa %ymm5, 32(%rdi)
+; AVX-NEXT: vmovdqa %ymm8, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
diff --git a/test/CodeGen/X86/zext-shl.ll b/test/CodeGen/X86/zext-shl.ll
index ac3ecc85f2d9..7722f46d753a 100644
--- a/test/CodeGen/X86/zext-shl.ll
+++ b/test/CodeGen/X86/zext-shl.ll
@@ -1,25 +1,26 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
-define i32 @t1(i8 zeroext %x) nounwind readnone ssp {
-entry:
+define i32 @t1(i8 zeroext %x) nounwind {
; CHECK-LABEL: t1:
-; CHECK: shll
-; CHECK-NOT: movzwl
-; CHECK: ret
- %0 = zext i8 %x to i16
- %1 = shl i16 %0, 5
- %2 = zext i16 %1 to i32
- ret i32 %2
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: retl
+ %t0 = zext i8 %x to i16
+ %t1 = shl i16 %t0, 5
+ %t2 = zext i16 %t1 to i32
+ ret i32 %t2
}
-define i32 @t2(i8 zeroext %x) nounwind readnone ssp {
-entry:
+define i32 @t2(i8 zeroext %x) nounwind {
; CHECK-LABEL: t2:
-; CHECK: shrl
-; CHECK-NOT: movzwl
-; CHECK: ret
- %0 = zext i8 %x to i16
- %1 = lshr i16 %0, 3
- %2 = zext i16 %1 to i32
- ret i32 %2
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $3, %eax
+; CHECK-NEXT: retl
+ %t0 = zext i8 %x to i16
+ %t1 = lshr i16 %t0, 3
+ %t2 = zext i16 %t1 to i32
+ ret i32 %t2
}
diff --git a/test/CodeGen/X86/zext-trunc.ll b/test/CodeGen/X86/zext-trunc.ll
index 32afd6b96a8b..e51a77abc92e 100644
--- a/test/CodeGen/X86/zext-trunc.ll
+++ b/test/CodeGen/X86/zext-trunc.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; rdar://7570931
define i64 @foo(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: foo:
-; CHECK: leal
-; CHECK-NOT: movl
-; CHECK: ret
+; CHECK: # BB#0:
+; CHECK-NEXT: leal (%rdi,%rsi), %eax
+; CHECK-NEXT: retq
%c = add i64 %a, %b
%d = trunc i64 %c to i32
%e = zext i32 %d to i64